diff --git a/go.mod b/go.mod index 327920585c..9008648443 100644 --- a/go.mod +++ b/go.mod @@ -9,17 +9,21 @@ require ( github.com/GeertJohan/go.rice v1.0.0 github.com/PuerkitoBio/goquery v1.5.1 github.com/TylerBrock/colorjson v0.0.0-20180527164720-95ec53f28296 - github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878 // indirect + github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6 + github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878 github.com/aws/aws-sdk-go v1.28.8 github.com/buger/jsonparser v0.0.0-20200322175846-f7e751efca13 github.com/cespare/xxhash/v2 v2.1.1 github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd // indirect + github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0 // indirect github.com/coreos/bbolt v1.3.2 // indirect github.com/coreos/etcd v3.3.10+incompatible github.com/coreos/go-systemd v0.0.0-20190719114852-fd7a80b32e1f // indirect github.com/coreos/pkg v0.0.0-20180928190104-399ea9e2e55f // indirect github.com/corpix/uarand v0.1.1 // indirect + github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432 github.com/evanphx/json-patch v4.5.0+incompatible + github.com/go-martini/martini v0.0.0-20170121215854-22fa46961aab github.com/go-sql-driver/mysql v1.5.0 github.com/gogo/protobuf v1.3.1 github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b @@ -34,11 +38,12 @@ require ( github.com/grpc-ecosystem/go-grpc-prometheus v1.2.0 github.com/hashicorp/consul/api v1.5.0 github.com/hashicorp/go-immutable-radix v1.1.0 // indirect - github.com/hashicorp/go-msgpack v0.5.5 // indirect + github.com/hashicorp/go-msgpack v0.5.5 github.com/hashicorp/go-sockaddr v1.0.2 // indirect github.com/hashicorp/go-uuid v1.0.2 // indirect github.com/hashicorp/golang-lru v0.5.3 // indirect github.com/hashicorp/serf v0.9.2 // indirect + github.com/howeyc/gopass v0.0.0-20190910152052-7cb4b85ec19c github.com/icrowley/fake v0.0.0-20180203215853-4178557ae428 github.com/imdario/mergo v0.3.6 // indirect github.com/klauspost/compress v1.4.1 // indirect @@ -48,22 +53,32 @@ require ( github.com/krishicks/yaml-patch v0.0.10 github.com/magiconair/properties v1.8.1 github.com/manifoldco/promptui v0.7.0 + github.com/martini-contrib/auth v0.0.0-20150219114609-fa62c19b7ae8 + github.com/martini-contrib/gzip v0.0.0-20151124214156-6c035326b43f + github.com/martini-contrib/render v0.0.0-20150707142108-ec18f8345a11 + github.com/mattn/go-sqlite3 v1.14.0 github.com/minio/minio-go v0.0.0-20190131015406-c8a261de75c1 github.com/mitchellh/go-ps v1.0.0 // indirect github.com/mitchellh/go-testing-interface v1.14.0 // indirect github.com/mitchellh/mapstructure v1.2.3 // indirect + github.com/montanaflynn/stats v0.6.3 github.com/olekukonko/tablewriter v0.0.5-0.20200416053754-163badb3bac6 github.com/onsi/ginkgo v1.10.3 // indirect github.com/onsi/gomega v1.7.1 // indirect github.com/opentracing-contrib/go-grpc v0.0.0-20180928155321-4b5a12d3ff02 github.com/opentracing/opentracing-go v1.1.0 + github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c // indirect + github.com/patrickmn/go-cache v2.1.0+incompatible github.com/pborman/uuid v1.2.0 github.com/philhofer/fwd v1.0.0 // indirect github.com/pires/go-proxyproto v0.0.0-20191211124218-517ecdf5bb2b github.com/pkg/errors v0.8.1 github.com/prometheus/client_golang v1.4.1 github.com/prometheus/common v0.9.1 + github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0 + github.com/samuel/go-zookeeper v0.0.0-20200724154423-2164a8ac840e github.com/satori/go.uuid v1.2.0 // indirect + 
github.com/sjmudd/stopwatch v0.0.0-20170613150411-f380bf8a9be1 github.com/smartystreets/goconvey v1.6.4 // indirect github.com/spf13/cobra v0.0.5 github.com/stretchr/testify v1.4.0 @@ -78,7 +93,7 @@ require ( github.com/z-division/go-zookeeper v0.0.0-20190128072838-6d7457066b9b golang.org/x/crypto v0.0.0-20200220183623-bac4c82f6975 golang.org/x/lint v0.0.0-20190409202823-959b441ac422 - golang.org/x/net v0.0.0-20200202094626-16171245cfb2 + golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 golang.org/x/sync v0.0.0-20190911185100-cd5d95a43a6e golang.org/x/text v0.3.2 @@ -89,8 +104,10 @@ require ( google.golang.org/grpc v1.24.0 gopkg.in/DataDog/dd-trace-go.v1 v1.17.0 gopkg.in/asn1-ber.v1 v1.0.0-20181015200546-f715ec2f112d // indirect + gopkg.in/gcfg.v1 v1.2.3 gopkg.in/ini.v1 v1.51.0 // indirect gopkg.in/ldap.v2 v2.5.0 + gopkg.in/warnings.v0 v0.1.2 // indirect gotest.tools v2.2.0+incompatible honnef.co/go/tools v0.0.1-2019.2.3 k8s.io/apiextensions-apiserver v0.17.3 diff --git a/go.sum b/go.sum index d017a2f9d3..10e55d0e67 100644 --- a/go.sum +++ b/go.sum @@ -70,6 +70,7 @@ github.com/andybalholm/cascadia v1.1.0 h1:BuuO6sSfQNFRu1LppgbD25Hr2vLYW25JvxHs5z github.com/andybalholm/cascadia v1.1.0/go.mod h1:GsXiBklL0woXo1j/WYWtSYYC4ouU9PqHO0sqidkEA4Y= github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e h1:QEF07wC0T1rKkctt1RINW/+RMTVmiwxETico2l3gxJA= github.com/armon/circbuf v0.0.0-20150827004946-bbbad097214e/go.mod h1:3U/XgcO3hCbHZ8TKRvWD2dDTCfh9M9ya+I9JpbB7O8o= +github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6 h1:G1bPvciwNyF7IUmKXNt9Ak3m6u9DE1rF+RmtIkBpVdA= github.com/armon/consul-api v0.0.0-20180202201655-eb2c6b5be1b6/go.mod h1:grANhF5doyWs3UAsr3K4I6qtAmlQcZDesFNEHPZAzj8= github.com/armon/go-metrics v0.0.0-20180917152333-f0300d1749da/go.mod h1:Q73ZrmVTwzkszR9V5SSuryQ31EELlFMUz1kKyl939pY= github.com/armon/go-metrics v0.0.0-20190430140413-ec5e00d3c878 h1:EFSB7Zo9Eg91v7MJPVsifUysc/wPdN+NOnVe6bWbdBM= @@ -110,6 +111,8 @@ github.com/cockroachdb/datadriven v0.0.0-20190809214429-80d97fb3cbaa/go.mod h1:z github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd h1:qMd81Ts1T2OTKmB4acZcyKaMtRnY5Y44NuXGX2GFJ1w= github.com/codahale/hdrhistogram v0.0.0-20161010025455-3a0bb77429bd/go.mod h1:sE/e/2PUdi/liOCUjSTXgM1o87ZssimdTWN964YiIeI= github.com/codegangsta/cli v1.20.0/go.mod h1:/qJNoX69yVSKu5o4jLyXAENLRyk1uhi7zkbQ3slBdOA= +github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0 h1:sDMmm+q/3+BukdIpxwO365v/Rbspp2Nt5XntgQRXq8Q= +github.com/codegangsta/inject v0.0.0-20150114235600-33e0aa1cb7c0/go.mod h1:4Zcjuz89kmFXt9morQgcfYZAYZ5n8WHjt81YYWIwtTM= github.com/coreos/bbolt v1.3.2 h1:wZwiHHUieZCquLkDL0B8UhzreNWsPHooDAG3q34zk0s= github.com/coreos/bbolt v1.3.2/go.mod h1:iRUV2dpdMOn7Bo10OQBFzIJO9kkE559Wcmn+qkEiiKk= github.com/coreos/etcd v3.3.10+incompatible h1:jFneRYjIvLMLhDLCzuTuU4rSJUjRplcJQ7pD7MnhC04= @@ -132,6 +135,8 @@ github.com/corpix/uarand v0.1.1 h1:RMr1TWc9F4n5jiPDzFHtmaUXLKLNUFK0SgCLo4BhX/U= github.com/corpix/uarand v0.1.1/go.mod h1:SFKZvkcRoLqVRFZ4u25xPmp6m9ktANfbpXZ7SJ0/FNU= github.com/cpuguy83/go-md2man v1.0.10/go.mod h1:SmD6nW6nTyfqj6ABTjUi3V3JVMnlJmwcJI5acqYI6dE= github.com/creack/pty v1.1.7/go.mod h1:lj5s0c3V2DBrqTV7llrYr5NG6My20zk30Fl46Y7DoTY= +github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432 h1:M5QgkYacWj0Xs8MhpIK/5uwU02icXpEoSo9sM2aRCps= +github.com/cyberdelia/go-metrics-graphite v0.0.0-20161219230853-39f87cc3b432/go.mod h1:xwIwAxMvYnVrGJPe2FKx5prTrnAjGOD8zvDOnxnrrkM= 
github.com/daaku/go.zipexe v1.0.0 h1:VSOgZtH418pH9L16hC/JrgSNJbbAL26pj7lmD1+CGdY= github.com/daaku/go.zipexe v1.0.0/go.mod h1:z8IiR6TsVLEYKwXAoE/I+8ys/sDkgTzSL0CLnGVd57E= github.com/davecgh/go-spew v0.0.0-20151105211317-5215b55f46b2/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= @@ -168,6 +173,8 @@ github.com/go-kit/kit v0.9.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2 github.com/go-logfmt/logfmt v0.3.0/go.mod h1:Qt1PoO58o5twSAckw1HlFXLmHsOX5/0LbT9GBnD5lWE= github.com/go-logfmt/logfmt v0.4.0/go.mod h1:3RMwSq7FuexP4Kalkev3ejPJsZTpXXBr9+V4qmtdjCk= github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas= +github.com/go-martini/martini v0.0.0-20170121215854-22fa46961aab h1:xveKWz2iaueeTaUgdetzel+U7exyigDYBryyVfV/rZk= +github.com/go-martini/martini v0.0.0-20170121215854-22fa46961aab/go.mod h1:/P9AEU963A2AYjv4d1V5eVL1CQbEJq6aCNHDDjibzu8= github.com/go-openapi/analysis v0.0.0-20180825180245-b006789cd277/go.mod h1:k70tL6pCuVxPJOHXQ+wIac1FUrvNkHolPie/cLEU6hI= github.com/go-openapi/analysis v0.17.0/go.mod h1:IowGgpVeD0vNm45So8nr+IcQ3pxVtpRoBWb8PVZO0ik= github.com/go-openapi/analysis v0.18.0/go.mod h1:IowGgpVeD0vNm45So8nr+IcQ3pxVtpRoBWb8PVZO0ik= @@ -334,6 +341,8 @@ github.com/hashicorp/memberlist v0.2.2/go.mod h1:MS2lj3INKhZjWNqd3N0m3J+Jxf3DAOn github.com/hashicorp/serf v0.9.0/go.mod h1:YL0HO+FifKOW2u1ke99DGVu1zhcpZzNwrLIqBC7vbYU= github.com/hashicorp/serf v0.9.2 h1:yJoyfZXo4Pk2p/M/viW+YLibBFiIbKoP79gu7kDAFP0= github.com/hashicorp/serf v0.9.2/go.mod h1:UWDWwZeL5cuWDJdl0C6wrvrUwEqtQ4ZKBKKENpqIUyk= +github.com/howeyc/gopass v0.0.0-20190910152052-7cb4b85ec19c h1:aY2hhxLhjEAbfXOx2nRJxCXezC6CO2V/yN+OCr1srtk= +github.com/howeyc/gopass v0.0.0-20190910152052-7cb4b85ec19c/go.mod h1:lADxMC39cJJqL93Duh1xhAs4I2Zs8mKS89XWXFGp9cs= github.com/hpcloud/tail v1.0.0 h1:nfCOvKYfkgYP8hkirhJocXT2+zOD8yUNjXaWfTlyFKI= github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpOxQnU= github.com/icrowley/fake v0.0.0-20180203215853-4178557ae428 h1:Mo9W14pwbO9VfRe+ygqZ8dFbPpoIK1HFrG/zjTuQ+nc= @@ -402,6 +411,12 @@ github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN github.com/mailru/easyjson v0.7.0/go.mod h1:KAzv3t3aY1NaHWoQz1+4F1ccyAH66Jk7yos7ldAVICs= github.com/manifoldco/promptui v0.7.0 h1:3l11YT8tm9MnwGFQ4kETwkzpAwY2Jt9lCrumCUW4+z4= github.com/manifoldco/promptui v0.7.0/go.mod h1:n4zTdgP0vr0S3w7/O/g98U+e0gwLScEXGwov2nIKuGQ= +github.com/martini-contrib/auth v0.0.0-20150219114609-fa62c19b7ae8 h1:1ded5x5QpCLsyTH5ct62Rh1RXPFnn0/dubCqAeh+stU= +github.com/martini-contrib/auth v0.0.0-20150219114609-fa62c19b7ae8/go.mod h1:ahTFgV/NtzY/CALneRrC67m1dis5arHTQDfyIhKk69E= +github.com/martini-contrib/gzip v0.0.0-20151124214156-6c035326b43f h1:wVDxEVZP1eiPIlHVaafUAEUDtyl6ytjHv3egJVbyfOk= +github.com/martini-contrib/gzip v0.0.0-20151124214156-6c035326b43f/go.mod h1:jhUB0rZB2TPWqy0yGugKRRictO591eSO7If7O4MfCaA= +github.com/martini-contrib/render v0.0.0-20150707142108-ec18f8345a11 h1:YFh+sjyJTMQSYjKwM4dFKhJPJC/wfo98tPUc17HdoYw= +github.com/martini-contrib/render v0.0.0-20150707142108-ec18f8345a11/go.mod h1:Ah2dBMoxZEqk118as2T4u4fjfXarE0pPnMJaArZQZsI= github.com/mattn/go-colorable v0.0.9 h1:UVL0vNpWh04HeJXV0KLcaT7r06gOH2l4OW6ddYRUIY4= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= github.com/mattn/go-colorable v0.1.4/go.mod h1:U0ppj6V5qS13XJ6of8GYAs25YV2eR4EVcfRqFIhoBtE= @@ -422,6 +437,8 @@ github.com/mattn/go-runewidth v0.0.2/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzp github.com/mattn/go-runewidth 
v0.0.3/go.mod h1:LwmH8dsx7+W8Uxz3IHJYH5QSwggIsqBzpuz5H//U1FU= github.com/mattn/go-runewidth v0.0.7 h1:Ei8KR0497xHyKJPAv59M1dkC+rOZCMBJ+t3fZ+twI54= github.com/mattn/go-runewidth v0.0.7/go.mod h1:H031xJmbD/WCDINGzjvQ9THkh0rPKHF+m2gUSrubnMI= +github.com/mattn/go-sqlite3 v1.14.0 h1:mLyGNKR8+Vv9CAU7PphKa2hkEqxxhn8i32J6FPj1/QA= +github.com/mattn/go-sqlite3 v1.14.0/go.mod h1:JIl7NbARA7phWnGvh0LKTyg7S9BA+6gx71ShQilpsus= github.com/matttproud/golang_protobuf_extensions v1.0.1 h1:4hp9jkHxhMHkqkrB3Ix0jegS5sx/RkqARlsWZ6pIwiU= github.com/matttproud/golang_protobuf_extensions v1.0.1/go.mod h1:D8He9yQNgCq6Z5Ld7szi9bcBfOoFv/3dc6xSMkL2PC0= github.com/miekg/dns v1.0.14 h1:9jZdLNd/P4+SfEJ0TNyxYpsK8N4GtfylBLqtbYN1sbA= @@ -453,6 +470,8 @@ github.com/modern-go/reflect2 v0.0.0-20180320133207-05fbef0ca5da/go.mod h1:bx2lN github.com/modern-go/reflect2 v0.0.0-20180701023420-4b7aa43c6742/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= github.com/modern-go/reflect2 v1.0.1 h1:9f412s+6RmYXLWZSEzVVgPGK7C2PphHj5RJrvfx9AWI= github.com/modern-go/reflect2 v1.0.1/go.mod h1:bx2lNnkwVCuqBIxFjflWJWanXIb3RllmbCylyMrvgv0= +github.com/montanaflynn/stats v0.6.3 h1:F8446DrvIF5V5smZfZ8K9nrmmix0AFgevPdLruGOmzk= +github.com/montanaflynn/stats v0.6.3/go.mod h1:wL8QJuTMNUDYhXwkmfOly8iTdp5TEcJFWZD2D7SIkUc= github.com/munnerz/goautoneg v0.0.0-20120707110453-a547fc61f48d/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20161129095857-cc309e4a2223/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= @@ -479,9 +498,14 @@ github.com/opentracing-contrib/go-grpc v0.0.0-20180928155321-4b5a12d3ff02 h1:0R5 github.com/opentracing-contrib/go-grpc v0.0.0-20180928155321-4b5a12d3ff02/go.mod h1:JNdpVEzCpXBgIiv4ds+TzhN1hrtxq6ClLrTlT9OQRSc= github.com/opentracing/opentracing-go v1.1.0 h1:pWlfV3Bxv7k65HYwkikxat0+s3pV4bsqf19k25Ur8rU= github.com/opentracing/opentracing-go v1.1.0/go.mod h1:UkNAQd3GIcIGf0SeVgPpRdFStlNbqXla1AfSYxPUl2o= +github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c h1:rp5dCmg/yLR3mgFuSOe4oEnDDmGLROTvMragMUXpTQw= +github.com/oxtoacart/bpool v0.0.0-20190530202638-03653db5a59c/go.mod h1:X07ZCGwUbLaax7L0S3Tw4hpejzu63ZrrQiUe6W0hcy0= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pascaldekloe/goe v0.1.0 h1:cBOtyMzM9HTpWjXfbbunk26uA6nG3a8n06Wieeh0MwY= github.com/pascaldekloe/goe v0.1.0/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= +github.com/patrickmn/go-cache v1.0.0 h1:3gD5McaYs9CxjyK5AXGcq8gdeCARtd/9gJDUvVeaZ0Y= +github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc= +github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ= github.com/pborman/uuid v1.2.0 h1:J7Q5mO4ysT1dv8hyrUGHb9+ooztCXu1D8MY8DZYsu3g= github.com/pborman/uuid v1.2.0/go.mod h1:X/NO0urCmaxf9VXbdlT7C2Yzkj2IKimNn4k+gtPdI/k= github.com/pelletier/go-toml v1.2.0 h1:T5zMGML61Wp+FlcbWjRDT7yAxhJNAiPPLOFECq181zc= @@ -523,6 +547,8 @@ github.com/prometheus/procfs v0.0.0-20181204211112-1dc9a6cbc91a/go.mod h1:c3At6R github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA= github.com/prometheus/procfs v0.0.8 h1:+fpWZdT24pJBiqJdAwYBjPSk+5YmQzYNPYzQsdzLkt8= github.com/prometheus/procfs v0.0.8/go.mod h1:7Qr8sr6344vo1JqZ6HhLceV9o3AJ1Ff+GxbHq6oeK9A= +github.com/rcrowley/go-metrics 
v0.0.0-20200313005456-10cdbea86bc0 h1:MkV+77GLUNo5oJ0jf870itWm3D0Sjh7+Za9gazKc5LQ= +github.com/rcrowley/go-metrics v0.0.0-20200313005456-10cdbea86bc0/go.mod h1:bCqnVzQkZxMG4s8nGwiZ5l3QUCyqpo9Y+/ZMZ9VjZe4= github.com/remyoudompheng/bigfft v0.0.0-20170806203942-52369c62f446/go.mod h1:uYEyJGbgTkfkS4+E/PavXkNJcbFIpEtjt2B0KDQ5+9M= github.com/rogpeppe/fastuuid v0.0.0-20150106093220-6724a57986af/go.mod h1:XWv6SoW27p1b0cqNHllgS5HIMJraePCO15w5zCzIWYg= github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4= @@ -530,6 +556,8 @@ github.com/russross/blackfriday v1.5.2/go.mod h1:JO/DiYxRf+HjHt06OyowR9PTA263kcR github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= github.com/ryanuber/columnize v2.1.0+incompatible h1:j1Wcmh8OrK4Q7GXY+V7SVSY8nUWQxHW5TkBe7YUl+2s= github.com/ryanuber/columnize v2.1.0+incompatible/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= +github.com/samuel/go-zookeeper v0.0.0-20200724154423-2164a8ac840e h1:CGjiMQ0wMH4wtNWrlj6kiTbkPt2F3rbYnhGX6TWLfco= +github.com/samuel/go-zookeeper v0.0.0-20200724154423-2164a8ac840e/go.mod h1:gi+0XIa01GRL2eRQVjQkKGqKF3SF9vZR/HnPullcV2E= github.com/satori/go.uuid v1.2.0 h1:0uYX9dsZ2yD7q2RtLRtPSdGDWzjeM3TbMJP9utgA0ww= github.com/satori/go.uuid v1.2.0/go.mod h1:dA0hQrYB0VpLJoorglMZABFdXlWrHn1NEOzdhQKdks0= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529 h1:nn5Wsu0esKSJiIVhscUtVbo7ada43DJhG55ua/hjS5I= @@ -538,6 +566,8 @@ github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAm github.com/sirupsen/logrus v1.2.0/go.mod h1:LxeOpSwHxABJmUn/MG1IvRgCAasNZTLOkJPxbbu5VWo= github.com/sirupsen/logrus v1.4.2 h1:SPIRibHv4MatM3XXNO2BJeFLZwZ2LvZgfQ5+UNI2im4= github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= +github.com/sjmudd/stopwatch v0.0.0-20170613150411-f380bf8a9be1 h1:acClJNSOjUrAUKW+ZneCZymCFDWtSaJG5YQl8FoOlyI= +github.com/sjmudd/stopwatch v0.0.0-20170613150411-f380bf8a9be1/go.mod h1:Pgf1sZ2KrHK8vdRTV5UHGp80LT7HMUKuNAiKC402abY= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304 h1:Jpy1PXuP99tXNrhbq2BaPz9B+jNAvH1JPQQpG/9GCXY= github.com/smartystreets/assertions v0.0.0-20190116191733-b6c0e53d7304/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= @@ -685,6 +715,8 @@ golang.org/x/net v0.0.0-20191004110552-13f9640d40b9 h1:rjwSpXsdiK0dV8/Naq3kAw9ym golang.org/x/net v0.0.0-20191004110552-13f9640d40b9/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= golang.org/x/net v0.0.0-20200202094626-16171245cfb2 h1:CCH4IOTTfewWjGOlSp+zGcjutRKlBEZQ6wTn8ozI/nI= golang.org/x/net v0.0.0-20200202094626-16171245cfb2/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e h1:3G+cUijn7XD+S4eJFddp53Pv7+slrESplyjG25HgL+k= +golang.org/x/net v0.0.0-20200324143707-d3edc9973b7e/go.mod h1:qpuaurCH72eLCgpAm/N6yyVIVM9cpaDIP3A8BGJEC5A= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45 h1:SVwTIAaPC2U/AvvLNZ2a7OVsmBpC8L5BlwK1whH3hm0= @@ -731,6 +763,8 @@ golang.org/x/sys v0.0.0-20200122134326-e047566fdf82/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20200124204421-9fbb57f87de9/go.mod 
h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae h1:/WDfKMnPU+m5M4xB+6x4kaepxRw6jWvR5iDRdvjHgy8= golang.org/x/sys v0.0.0-20200223170610-d5e6a3e2c0ae/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd h1:xhmwyvizuTgC2qz7ZlMluP20uW+C3Rm0FD/WLDX8884= +golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/text v0.0.0-20160726164857-2910a502d2bf/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -817,6 +851,8 @@ gopkg.in/cheggaaa/pb.v1 v1.0.25/go.mod h1:V/YB90LKu/1FcN3WVnfiiE5oMCibMjukxqG/qS gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/fsnotify.v1 v1.4.7 h1:xOHLXZwVvI9hhs+cLKq5+I5onOuwQLhQwiu63xxlHs4= gopkg.in/fsnotify.v1 v1.4.7/go.mod h1:Tz8NjZHkW78fSQdbUxIjBTcgA1z1m8ZHf0WmKUhAMys= +gopkg.in/gcfg.v1 v1.2.3 h1:m8OOJ4ccYHnx2f4gQwpno8nAX5OGOh7RLaaz0pj3Ogs= +gopkg.in/gcfg.v1 v1.2.3/go.mod h1:yesOnuUOFQAhST5vPY4nbZsb/huCgGGXlipJsBn0b3o= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/ini.v1 v1.41.0 h1:Ka3ViY6gNYSKiVy71zXBEqKplnV35ImDLVG+8uoIklE= @@ -831,6 +867,8 @@ gopkg.in/resty.v1 v1.12.0/go.mod h1:mDo4pnntr5jdWRML875a/NmxYqAlA73dVijT2AXvQQo= gopkg.in/square/go-jose.v2 v2.2.2/go.mod h1:M9dMgbHiYLoDGQrXy7OpJDJWiKiU//h+vD76mk0e1AI= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkepLTh2hOroT7a+7czfdQ= gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw= +gopkg.in/warnings.v0 v0.1.2 h1:wFXVbFY8DY5/xOe1ECiWdKCzZlxgshcYVNkBHstARME= +gopkg.in/warnings.v0 v0.1.2/go.mod h1:jksf8JmL6Qr/oQM2OXTHunEvvTAsrWBLb6OOjuVWRNI= gopkg.in/yaml.v2 v2.0.0-20170812160011-eb3733d160e7/go.mod h1:JAlM8MvJe8wmxCU4Bli9HhUf9+ttbYbLASfIpnQbh74= gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.2 h1:ZCJp+EgiOT7lHqUV2J862kp8Qj64Jo6az82+3Td9dZw= diff --git a/go/cmd/orchestrator/main.go b/go/cmd/orchestrator/main.go new file mode 100644 index 0000000000..85b828f7ce --- /dev/null +++ b/go/cmd/orchestrator/main.go @@ -0,0 +1,171 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package main + +import ( + "flag" + "fmt" + "os" + + "vitess.io/vitess/go/vt/orchestrator/app" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/inst" +) + +var AppVersion, GitCommit string + +// main is the application's entry point. It will either spawn a CLI or HTTP itnerfaces. 
+func main() { + configFile := flag.String("config", "", "config file name") + command := flag.String("c", "", "command, required. See full list of commands via 'orchestrator -c help'") + strict := flag.Bool("strict", false, "strict mode (more checks, slower)") + instance := flag.String("i", "", "instance, host_fqdn[:port] (e.g. db.company.com:3306, db.company.com)") + sibling := flag.String("s", "", "sibling instance, host_fqdn[:port]") + destination := flag.String("d", "", "destination instance, host_fqdn[:port] (synonym to -s)") + owner := flag.String("owner", "", "operation owner") + reason := flag.String("reason", "", "operation reason") + duration := flag.String("duration", "", "maintenance duration (format: 59s, 59m, 23h, 6d, 4w)") + pattern := flag.String("pattern", "", "regular expression pattern") + clusterAlias := flag.String("alias", "", "cluster alias") + pool := flag.String("pool", "", "Pool logical name (applies for pool-related commands)") + hostnameFlag := flag.String("hostname", "", "Hostname/fqdn/CNAME/VIP (applies for hostname/resolve related commands)") + discovery := flag.Bool("discovery", true, "auto discovery mode") + quiet := flag.Bool("quiet", false, "quiet") + verbose := flag.Bool("verbose", false, "verbose") + debug := flag.Bool("debug", false, "debug mode (very verbose)") + stack := flag.Bool("stack", false, "add stack trace upon error") + config.RuntimeCLIFlags.SkipBinlogSearch = flag.Bool("skip-binlog-search", false, "when matching via Pseudo-GTID, only use relay logs. This can save the hassle of searching for a non-existent pseudo-GTID entry, for example in servers with replication filters.") + config.RuntimeCLIFlags.SkipUnresolve = flag.Bool("skip-unresolve", false, "Do not unresolve a host name") + config.RuntimeCLIFlags.SkipUnresolveCheck = flag.Bool("skip-unresolve-check", false, "Skip/ignore checking an unresolve mapping (via hostname_unresolve table) resolves back to same hostname") + config.RuntimeCLIFlags.Noop = flag.Bool("noop", false, "Dry run; do not perform destructive operations") + config.RuntimeCLIFlags.BinlogFile = flag.String("binlog", "", "Binary log file name") + config.RuntimeCLIFlags.Statement = flag.String("statement", "", "Statement/hint") + config.RuntimeCLIFlags.GrabElection = flag.Bool("grab-election", false, "Grab leadership (only applies to continuous mode)") + config.RuntimeCLIFlags.PromotionRule = flag.String("promotion-rule", "prefer", "Promotion rule for register-candidate (prefer|neutral|prefer_not|must_not)") + config.RuntimeCLIFlags.Version = flag.Bool("version", false, "Print version and exit") + config.RuntimeCLIFlags.SkipContinuousRegistration = flag.Bool("skip-continuous-registration", false, "Skip cli commands performing continuous registration (to reduce orchestrator backend db load)") + config.RuntimeCLIFlags.EnableDatabaseUpdate = flag.Bool("enable-database-update", false, "Enable database update, overrides SkipOrchestratorDatabaseUpdate") + config.RuntimeCLIFlags.IgnoreRaftSetup = flag.Bool("ignore-raft-setup", false, "Override RaftEnabled for CLI invocation (CLI by default not allowed for raft setups). NOTE: operations by CLI invocation may not reflect in all raft nodes.") + config.RuntimeCLIFlags.Tag = flag.String("tag", "", "tag to add ('tagname' or 'tagname=tagvalue') or to search ('tagname' or 'tagname=tagvalue' or comma separated 'tag0,tag1=val1,tag2' for intersection of all)") + flag.Parse() + + if *destination != "" && *sibling != "" { + log.Fatalf("-s and -d are synonyms, yet both were specified.
You're probably doing the wrong thing.") + } + switch *config.RuntimeCLIFlags.PromotionRule { + case "prefer", "neutral", "prefer_not", "must_not": + { + // OK + } + default: + { + log.Fatalf("-promotion-rule only supports prefer|neutral|prefer_not|must_not") + } + } + if *destination == "" { + *destination = *sibling + } + + log.SetLevel(log.ERROR) + if *verbose { + log.SetLevel(log.INFO) + } + if *debug { + log.SetLevel(log.DEBUG) + } + if *stack { + log.SetPrintStackTrace(*stack) + } + if *config.RuntimeCLIFlags.Version { + fmt.Println(AppVersion) + fmt.Println(GitCommit) + return + } + + startText := "starting orchestrator" + if AppVersion != "" { + startText += ", version: " + AppVersion + } + if GitCommit != "" { + startText += ", git commit: " + GitCommit + } + log.Info(startText) + + if len(*configFile) > 0 { + config.ForceRead(*configFile) + } else { + config.Read("/etc/orchestrator.conf.json", "conf/orchestrator.conf.json", "orchestrator.conf.json") + } + if *config.RuntimeCLIFlags.EnableDatabaseUpdate { + config.Config.SkipOrchestratorDatabaseUpdate = false + } + if config.Config.Debug { + log.SetLevel(log.DEBUG) + } + if *quiet { + // Override!! + log.SetLevel(log.ERROR) + } + if config.Config.EnableSyslog { + log.EnableSyslogWriter("orchestrator") + log.SetSyslogLevel(log.INFO) + } + if config.Config.AuditToSyslog { + inst.EnableAuditSyslog() + } + config.RuntimeCLIFlags.ConfiguredVersion = AppVersion + config.MarkConfigurationLoaded() + + if len(flag.Args()) == 0 && *command == "" { + // No command, no argument: just prompt + fmt.Println(app.AppPrompt) + return + } + + helpTopic := "" + if flag.Arg(0) == "help" { + if flag.Arg(1) != "" { + helpTopic = flag.Arg(1) + } + if helpTopic == "" { + helpTopic = *command + } + if helpTopic == "" { + // hacky way to make the CLI kick in as if the user typed `orchestrator -c help cli` + *command = "help" + flag.Args()[0] = "cli" + } + } + + switch { + case helpTopic != "": + app.HelpCommand(helpTopic) + case len(flag.Args()) == 0 || flag.Arg(0) == "cli": + app.CliWrapper(*command, *strict, *instance, *destination, *owner, *reason, *duration, *pattern, *clusterAlias, *pool, *hostnameFlag) + case flag.Arg(0) == "http": + app.Http(*discovery) + default: + fmt.Fprintln(os.Stderr, `Usage: + orchestrator --options... [cli|http] +See complete list of commands: + orchestrator -c help +Full blown documentation: + orchestrator`) + os.Exit(1) + } +} diff --git a/go/vt/orchestrator/agent/agent.go b/go/vt/orchestrator/agent/agent.go new file mode 100644 index 0000000000..775a24af16 --- /dev/null +++ b/go/vt/orchestrator/agent/agent.go @@ -0,0 +1,82 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package agent + +import "vitess.io/vitess/go/vt/orchestrator/inst" + +// LogicalVolume describes an LVM volume +type LogicalVolume struct { + Name string + GroupName string + Path string + IsSnapshot bool + SnapshotPercent float64 +} + +// Mount describes a file system mount point +type Mount struct { + Path string + Device string + LVPath string + FileSystem string + IsMounted bool + DiskUsage int64 + MySQLDataPath string + MySQLDiskUsage int64 +} + +// Agent presents the data of an agent +type Agent struct { + Hostname string + Port int + Token string + LastSubmitted string + AvailableLocalSnapshots []string + AvailableSnapshots []string + LogicalVolumes []LogicalVolume + MountPoint Mount + MySQLRunning bool + MySQLDiskUsage int64 + MySQLPort int64 + MySQLDatadirDiskFree int64 + MySQLErrorLogTail []string +} + +// SeedOperation makes for the high level data & state of a seed operation +type SeedOperation struct { + SeedId int64 + TargetHostname string + SourceHostname string + StartTimestamp string + EndTimestamp string + IsComplete bool + IsSuccessful bool +} + +// SeedOperationState represents a single state (step) in a seed operation +type SeedOperationState struct { + SeedStateId int64 + SeedId int64 + StateTimestamp string + Action string + ErrorMessage string +} + +// Build an instance key for a given agent +func (this *Agent) GetInstance() *inst.InstanceKey { + return &inst.InstanceKey{Hostname: this.Hostname, Port: int(this.MySQLPort)} +} diff --git a/go/vt/orchestrator/agent/agent_dao.go b/go/vt/orchestrator/agent/agent_dao.go new file mode 100644 index 0000000000..52cc53cf22 --- /dev/null +++ b/go/vt/orchestrator/agent/agent_dao.go @@ -0,0 +1,944 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package agent + +import ( + "crypto/tls" + "encoding/json" + "errors" + "fmt" + "io/ioutil" + "net" + "net/http" + "strings" + "sync" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + "vitess.io/vitess/go/vt/orchestrator/inst" +) + +type httpMethodFunc func(uri string) (resp *http.Response, err error) + +var SeededAgents chan *Agent = make(chan *Agent) + +var httpClient *http.Client +var httpClientMutex = &sync.Mutex{} + +// InitHttpClient gets called once, and initializes httpClient according to config.Config +func InitHttpClient() { + httpClientMutex.Lock() + defer httpClientMutex.Unlock() + + if httpClient != nil { + return + } + + httpTimeout := time.Duration(time.Duration(config.AgentHttpTimeoutSeconds) * time.Second) + dialTimeout := func(network, addr string) (net.Conn, error) { + return net.DialTimeout(network, addr, httpTimeout) + } + httpTransport := &http.Transport{ + TLSClientConfig: &tls.Config{InsecureSkipVerify: config.Config.AgentSSLSkipVerify}, + Dial: dialTimeout, + ResponseHeaderTimeout: httpTimeout, + } + httpClient = &http.Client{Transport: httpTransport} +} + +// httpGet is a convenience method for getting http response from URL, optionaly skipping SSL cert verification +func httpGet(url string) (resp *http.Response, err error) { + return httpClient.Get(url) +} + +// httpPost is a convenience method for posting text data +func httpPost(url string, bodyType string, content string) (resp *http.Response, err error) { + return httpClient.Post(url, bodyType, strings.NewReader(content)) +} + +// AuditAgentOperation creates and writes a new audit entry by given agent +func auditAgentOperation(auditType string, agent *Agent, message string) error { + instanceKey := &inst.InstanceKey{} + if agent != nil { + instanceKey = &inst.InstanceKey{Hostname: agent.Hostname, Port: int(agent.MySQLPort)} + } + return inst.AuditOperation(auditType, instanceKey, message) +} + +// readResponse returns the body of an HTTP response +func readResponse(res *http.Response, err error) ([]byte, error) { + if err != nil { + return nil, err + } + defer res.Body.Close() + + body, err := ioutil.ReadAll(res.Body) + if err != nil { + return nil, err + } + + if res.Status == "500" { + return body, errors.New("Response Status 500") + } + + return body, nil +} + +// SubmitAgent submits a new agent for listing +func SubmitAgent(hostname string, port int, token string) (string, error) { + _, err := db.ExecOrchestrator(` + replace + into host_agent ( + hostname, port, token, last_submitted, count_mysql_snapshots + ) VALUES ( + ?, ?, ?, NOW(), 0 + ) + `, + hostname, + port, + token, + ) + if err != nil { + return "", log.Errore(err) + } + + // Try to discover topology instances when an agent submits + go DiscoverAgentInstance(hostname, port) + + return hostname, err +} + +// If a mysql port is available, try to discover against it +func DiscoverAgentInstance(hostname string, port int) error { + agent, err := GetAgent(hostname) + if err != nil { + log.Errorf("Couldn't get agent for %s: %v", hostname, err) + return err + } + + instanceKey := agent.GetInstance() + instance, err := inst.ReadTopologyInstance(instanceKey) + if err != nil { + log.Errorf("Failed to read topology for %v. 
err=%+v", instanceKey, err) + return err + } + if instance == nil { + log.Errorf("Failed to read topology for %v", instanceKey) + return err + } + log.Infof("Discovered Agent Instance: %v", instance.Key) + return nil +} + +// ForgetLongUnseenAgents will remove entries of all agents that have long since been last seen. +func ForgetLongUnseenAgents() error { + _, err := db.ExecOrchestrator(` + delete + from host_agent + where + last_submitted < NOW() - interval ? hour`, + config.Config.UnseenAgentForgetHours, + ) + return err +} + +// ReadOutdatedAgentsHosts returns agents that need to be updated +func ReadOutdatedAgentsHosts() ([]string, error) { + res := []string{} + query := ` + select + hostname + from + host_agent + where + IFNULL(last_checked < now() - interval ? minute, 1) + ` + err := db.QueryOrchestrator(query, sqlutils.Args(config.Config.AgentPollMinutes), func(m sqlutils.RowMap) error { + hostname := m.GetString("hostname") + res = append(res, hostname) + return nil + }) + + if err != nil { + log.Errore(err) + } + return res, err +} + +// ReadAgents returns a list of all known agents +func ReadAgents() ([]Agent, error) { + res := []Agent{} + query := ` + select + hostname, + port, + token, + last_submitted, + mysql_port + from + host_agent + order by + hostname + ` + err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + agent := Agent{} + agent.Hostname = m.GetString("hostname") + agent.Port = m.GetInt("port") + agent.MySQLPort = m.GetInt64("mysql_port") + agent.Token = "" + agent.LastSubmitted = m.GetString("last_submitted") + + res = append(res, agent) + return nil + }) + + if err != nil { + log.Errore(err) + } + return res, err + +} + +// readAgentBasicInfo returns the basic data for an agent directly from backend table (no agent access) +func readAgentBasicInfo(hostname string) (Agent, string, error) { + agent := Agent{} + token := "" + query := ` + select + hostname, + port, + token, + last_submitted, + mysql_port + from + host_agent + where + hostname = ? + ` + err := db.QueryOrchestrator(query, sqlutils.Args(hostname), func(m sqlutils.RowMap) error { + agent.Hostname = m.GetString("hostname") + agent.Port = m.GetInt("port") + agent.LastSubmitted = m.GetString("last_submitted") + agent.MySQLPort = m.GetInt64("mysql_port") + token = m.GetString("token") + + return nil + }) + if err != nil { + return agent, "", err + } + + if token == "" { + return agent, "", log.Errorf("Cannot get agent/token: %s", hostname) + } + return agent, token, nil +} + +// UpdateAgentLastChecked updates the last_check timestamp in the orchestrator backed database +// for a given agent +func UpdateAgentLastChecked(hostname string) error { + _, err := db.ExecOrchestrator(` + update + host_agent + set + last_checked = NOW() + where + hostname = ?`, + hostname, + ) + if err != nil { + return log.Errore(err) + } + + return nil +} + +// UpdateAgentInfo updates some agent state in backend table +func UpdateAgentInfo(hostname string, agent Agent) error { + _, err := db.ExecOrchestrator(` + update + host_agent + set + last_seen = NOW(), + mysql_port = ?, + count_mysql_snapshots = ? 
+ where + hostname = ?`, + agent.MySQLPort, + len(agent.LogicalVolumes), + hostname, + ) + if err != nil { + return log.Errore(err) + } + + return nil +} + +// baseAgentUri returns the base URI for accessing an agent +func baseAgentUri(agentHostname string, agentPort int) string { + protocol := "http" + if config.Config.AgentsUseSSL { + protocol = "https" + } + uri := fmt.Sprintf("%s://%s:%d/api", protocol, agentHostname, agentPort) + log.Debugf("orchestrator-agent uri: %s", uri) + return uri +} + +// GetAgent gets a single agent status from the agent service. This involves multiple HTTP requests. +func GetAgent(hostname string) (Agent, error) { + agent, token, err := readAgentBasicInfo(hostname) + if err != nil { + return agent, log.Errore(err) + } + + // All seems to be in order. Now make some inquiries from orchestrator-agent service: + { + uri := baseAgentUri(agent.Hostname, agent.Port) + log.Debugf("orchestrator-agent uri: %s", uri) + + { + availableLocalSnapshotsUri := fmt.Sprintf("%s/available-snapshots-local?token=%s", uri, token) + body, err := readResponse(httpGet(availableLocalSnapshotsUri)) + if err == nil { + err = json.Unmarshal(body, &agent.AvailableLocalSnapshots) + } + if err != nil { + log.Errore(err) + } + } + { + availableSnapshotsUri := fmt.Sprintf("%s/available-snapshots?token=%s", uri, token) + body, err := readResponse(httpGet(availableSnapshotsUri)) + if err == nil { + err = json.Unmarshal(body, &agent.AvailableSnapshots) + } + if err != nil { + log.Errore(err) + } + } + { + lvSnapshotsUri := fmt.Sprintf("%s/lvs-snapshots?token=%s", uri, token) + body, err := readResponse(httpGet(lvSnapshotsUri)) + if err == nil { + err = json.Unmarshal(body, &agent.LogicalVolumes) + } + if err != nil { + log.Errore(err) + } + } + { + mountUri := fmt.Sprintf("%s/mount?token=%s", uri, token) + body, err := readResponse(httpGet(mountUri)) + if err == nil { + err = json.Unmarshal(body, &agent.MountPoint) + } + if err != nil { + log.Errore(err) + } + } + { + mySQLRunningUri := fmt.Sprintf("%s/mysql-status?token=%s", uri, token) + body, err := readResponse(httpGet(mySQLRunningUri)) + if err == nil { + err = json.Unmarshal(body, &agent.MySQLRunning) + } + // Actually an error is OK here since "status" returns with non-zero exit code when MySQL not running + } + { + mySQLRunningUri := fmt.Sprintf("%s/mysql-port?token=%s", uri, token) + body, err := readResponse(httpGet(mySQLRunningUri)) + if err == nil { + err = json.Unmarshal(body, &agent.MySQLPort) + } + if err != nil { + log.Errore(err) + } + } + { + mySQLDiskUsageUri := fmt.Sprintf("%s/mysql-du?token=%s", uri, token) + body, err := readResponse(httpGet(mySQLDiskUsageUri)) + if err == nil { + err = json.Unmarshal(body, &agent.MySQLDiskUsage) + } + if err != nil { + log.Errore(err) + } + } + { + mySQLDatadirDiskFreeUri := fmt.Sprintf("%s/mysql-datadir-available-space?token=%s", uri, token) + body, err := readResponse(httpGet(mySQLDatadirDiskFreeUri)) + if err == nil { + err = json.Unmarshal(body, &agent.MySQLDatadirDiskFree) + } + if err != nil { + log.Errore(err) + } + } + { + errorLogTailUri := fmt.Sprintf("%s/mysql-error-log-tail?token=%s", uri, token) + body, err := readResponse(httpGet(errorLogTailUri)) + if err == nil { + err = json.Unmarshal(body, &agent.MySQLErrorLogTail) + } + if err != nil { + log.Errore(err) + } + } + } + return agent, err +} + +// executeAgentCommandWithMethodFunc requests an agent to execute a command via HTTP api, either GET or POST, +// with specific http method implementation by the caller +func 
executeAgentCommandWithMethodFunc(hostname string, command string, methodFunc httpMethodFunc, onResponse *func([]byte)) (Agent, error) { + agent, token, err := readAgentBasicInfo(hostname) + if err != nil { + return agent, err + } + + // All seems to be in order. Now make some inquiries from orchestrator-agent service: + uri := baseAgentUri(agent.Hostname, agent.Port) + + var fullCommand string + if strings.Contains(command, "?") { + fullCommand = fmt.Sprintf("%s&token=%s", command, token) + } else { + fullCommand = fmt.Sprintf("%s?token=%s", command, token) + } + log.Debugf("orchestrator-agent command: %s", fullCommand) + agentCommandUri := fmt.Sprintf("%s/%s", uri, fullCommand) + + body, err := readResponse(methodFunc(agentCommandUri)) + if err != nil { + return agent, log.Errore(err) + } + if onResponse != nil { + (*onResponse)(body) + } + auditAgentOperation("agent-command", &agent, command) + + return agent, err +} + +// executeAgentCommand requests an agent to execute a command via HTTP api +func executeAgentCommand(hostname string, command string, onResponse *func([]byte)) (Agent, error) { + httpFunc := func(uri string) (resp *http.Response, err error) { + return httpGet(uri) + } + return executeAgentCommandWithMethodFunc(hostname, command, httpFunc, onResponse) +} + +// executeAgentPostCommand requests an agent to execute a command via HTTP POST +func executeAgentPostCommand(hostname string, command string, content string, onResponse *func([]byte)) (Agent, error) { + httpFunc := func(uri string) (resp *http.Response, err error) { + return httpPost(uri, "text/plain", content) + } + return executeAgentCommandWithMethodFunc(hostname, command, httpFunc, onResponse) +} + +// Unmount unmounts the designated snapshot mount point +func Unmount(hostname string) (Agent, error) { + return executeAgentCommand(hostname, "umount", nil) +} + +// MountLV requests an agent to mount the given volume on the designated mount point +func MountLV(hostname string, lv string) (Agent, error) { + return executeAgentCommand(hostname, fmt.Sprintf("mountlv?lv=%s", lv), nil) +} + +// RemoveLV requests an agent to remove a snapshot +func RemoveLV(hostname string, lv string) (Agent, error) { + return executeAgentCommand(hostname, fmt.Sprintf("removelv?lv=%s", lv), nil) +} + +// CreateSnapshot requests an agent to create a new snapshot -- a DIY implementation +func CreateSnapshot(hostname string) (Agent, error) { + return executeAgentCommand(hostname, "create-snapshot", nil) +} + +// deleteMySQLDatadir requests an agent to purge the MySQL data directory (step before seed) +func deleteMySQLDatadir(hostname string) (Agent, error) { + return executeAgentCommand(hostname, "delete-mysql-datadir", nil) +} + +// MySQLStop requests an agent to stop MySQL service +func MySQLStop(hostname string) (Agent, error) { + return executeAgentCommand(hostname, "mysql-stop", nil) +} + +// MySQLStart requests an agent to start the MySQL service +func MySQLStart(hostname string) (Agent, error) { + return executeAgentCommand(hostname, "mysql-start", nil) +} + +// ReceiveMySQLSeedData requests an agent to start listening for incoming seed data +func ReceiveMySQLSeedData(hostname string, seedId int64) (Agent, error) { + return executeAgentCommand(hostname, fmt.Sprintf("receive-mysql-seed-data/%d", seedId), nil) +} + +// ReceiveMySQLSeedData requests an agent to start sending seed data +func SendMySQLSeedData(hostname string, targetHostname string, seedId int64) (Agent, error) { + return executeAgentCommand(hostname, 
fmt.Sprintf("send-mysql-seed-data/%s/%d", targetHostname, seedId), nil) +} + +// ReceiveMySQLSeedData requests an agent to abort seed send/receive (depending on the agent's role) +func AbortSeedCommand(hostname string, seedId int64) (Agent, error) { + return executeAgentCommand(hostname, fmt.Sprintf("abort-seed/%d", seedId), nil) +} + +func CustomCommand(hostname string, cmd string) (output string, err error) { + onResponse := func(body []byte) { + output = string(body) + log.Debugf("output: %v", output) + } + + _, err = executeAgentCommand(hostname, fmt.Sprintf("custom-commands/%s", cmd), &onResponse) + return output, err +} + +// seedCommandCompleted checks an agent to see if it thinks a seed was completed. +func seedCommandCompleted(hostname string, seedId int64) (Agent, bool, error) { + result := false + onResponse := func(body []byte) { + json.Unmarshal(body, &result) + } + agent, err := executeAgentCommand(hostname, fmt.Sprintf("seed-command-completed/%d", seedId), &onResponse) + return agent, result, err +} + +// seedCommandCompleted checks an agent to see if it thinks a seed was successful. +func seedCommandSucceeded(hostname string, seedId int64) (Agent, bool, error) { + result := false + onResponse := func(body []byte) { + json.Unmarshal(body, &result) + } + agent, err := executeAgentCommand(hostname, fmt.Sprintf("seed-command-succeeded/%d", seedId), &onResponse) + return agent, result, err +} + +// AbortSeed will contact agents associated with a seed and request abort. +func AbortSeed(seedId int64) error { + seedOperations, err := AgentSeedDetails(seedId) + if err != nil { + return log.Errore(err) + } + + for _, seedOperation := range seedOperations { + AbortSeedCommand(seedOperation.TargetHostname, seedId) + AbortSeedCommand(seedOperation.SourceHostname, seedId) + } + updateSeedComplete(seedId, errors.New("Aborted")) + return nil +} + +// PostCopy will request an agent to invoke post-copy commands +func PostCopy(hostname, sourceHostname string) (Agent, error) { + return executeAgentCommand(hostname, fmt.Sprintf("post-copy/?sourceHost=%s", sourceHostname), nil) +} + +// SubmitSeedEntry submits a new seed operation entry, returning its unique ID +func SubmitSeedEntry(targetHostname string, sourceHostname string) (int64, error) { + res, err := db.ExecOrchestrator(` + insert + into agent_seed ( + target_hostname, source_hostname, start_timestamp + ) VALUES ( + ?, ?, NOW() + ) + `, + targetHostname, + sourceHostname, + ) + if err != nil { + return 0, log.Errore(err) + } + id, err := res.LastInsertId() + + return id, err +} + +// updateSeedComplete updates the seed entry, signing for completion +func updateSeedComplete(seedId int64, seedError error) error { + _, err := db.ExecOrchestrator(` + update + agent_seed + set end_timestamp = NOW(), + is_complete = 1, + is_successful = ? + where + agent_seed_id = ? + `, + (seedError == nil), + seedId, + ) + if err != nil { + return log.Errore(err) + } + + return nil +} + +// submitSeedStateEntry submits a seed state: a single step in the overall seed process +func submitSeedStateEntry(seedId int64, action string, errorMessage string) (int64, error) { + res, err := db.ExecOrchestrator(` + insert + into agent_seed_state ( + agent_seed_id, state_timestamp, state_action, error_message + ) VALUES ( + ?, NOW(), ?, ? 
+ ) + `, + seedId, + action, + errorMessage, + ) + if err != nil { + return 0, log.Errore(err) + } + id, err := res.LastInsertId() + + return id, err +} + +// updateSeedStateEntry updates seed step state +func updateSeedStateEntry(seedStateId int64, reason error) error { + _, err := db.ExecOrchestrator(` + update + agent_seed_state + set error_message = ? + where + agent_seed_state_id = ? + `, + reason.Error(), + seedStateId, + ) + if err != nil { + return log.Errore(err) + } + + return reason +} + +// FailStaleSeeds marks as failed seeds where no progress have been seen recently +func FailStaleSeeds() error { + _, err := db.ExecOrchestrator(` + update + agent_seed + set + is_complete=1, + is_successful=0 + where + is_complete=0 + and ( + select + max(state_timestamp) as last_state_timestamp + from + agent_seed_state + where + agent_seed.agent_seed_id = agent_seed_state.agent_seed_id + ) < now() - interval ? minute`, + config.Config.StaleSeedFailMinutes, + ) + return err +} + +// executeSeed is *the* function for taking a seed. It is a complex operation of testing, preparing, re-testing +// agents on both sides, initiating data transfer, following up, awaiting completion, diagnosing errors, claning up. +func executeSeed(seedId int64, targetHostname string, sourceHostname string) error { + + var err error + var seedStateId int64 + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("getting target agent info for %s", targetHostname), "") + targetAgent, err := GetAgent(targetHostname) + SeededAgents <- &targetAgent + if err != nil { + return updateSeedStateEntry(seedStateId, err) + } + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("getting source agent info for %s", sourceHostname), "") + sourceAgent, err := GetAgent(sourceHostname) + if err != nil { + return updateSeedStateEntry(seedStateId, err) + } + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("Checking MySQL status on target %s", targetHostname), "") + if targetAgent.MySQLRunning { + return updateSeedStateEntry(seedStateId, errors.New("MySQL is running on target host. Cowardly refusing to proceeed. 
Please stop the MySQL service")) + } + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("Looking up available snapshots on source %s", sourceHostname), "") + if len(sourceAgent.LogicalVolumes) == 0 { + return updateSeedStateEntry(seedStateId, errors.New("No logical volumes found on source host")) + } + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("Checking mount point on source %s", sourceHostname), "") + if sourceAgent.MountPoint.IsMounted { + return updateSeedStateEntry(seedStateId, errors.New("Volume already mounted on source host; please unmount")) + } + + seedFromLogicalVolume := sourceAgent.LogicalVolumes[0] + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("%s Mounting logical volume: %s", sourceHostname, seedFromLogicalVolume.Path), "") + _, err = MountLV(sourceHostname, seedFromLogicalVolume.Path) + if err != nil { + return updateSeedStateEntry(seedStateId, err) + } + sourceAgent, err = GetAgent(sourceHostname) + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("MySQL data volume on source host %s is %d bytes", sourceHostname, sourceAgent.MountPoint.MySQLDiskUsage), "") + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("Erasing MySQL data on %s", targetHostname), "") + _, err = deleteMySQLDatadir(targetHostname) + if err != nil { + return updateSeedStateEntry(seedStateId, err) + } + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("Aquiring target host datadir free space on %s", targetHostname), "") + targetAgent, err = GetAgent(targetHostname) + if err != nil { + return updateSeedStateEntry(seedStateId, err) + } + + if sourceAgent.MountPoint.MySQLDiskUsage > targetAgent.MySQLDatadirDiskFree { + Unmount(sourceHostname) + return updateSeedStateEntry(seedStateId, fmt.Errorf("Not enough disk space on target host %s. Required: %d, available: %d. Bailing out.", targetHostname, sourceAgent.MountPoint.MySQLDiskUsage, targetAgent.MySQLDatadirDiskFree)) + } + + // ... + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("%s will now receive data in background", targetHostname), "") + ReceiveMySQLSeedData(targetHostname, seedId) + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("Waiting %d seconds for %s to start listening for incoming data", config.Config.SeedWaitSecondsBeforeSend, targetHostname), "") + time.Sleep(time.Duration(config.Config.SeedWaitSecondsBeforeSend) * time.Second) + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("%s will now send data to %s in background", sourceHostname, targetHostname), "") + SendMySQLSeedData(sourceHostname, targetHostname, seedId) + + copyComplete := false + numStaleIterations := 0 + var bytesCopied int64 = 0 + + for !copyComplete { + targetAgentPoll, err := GetAgent(targetHostname) + if err != nil { + return log.Errore(err) + } + + if targetAgentPoll.MySQLDiskUsage == bytesCopied { + numStaleIterations++ + } + bytesCopied = targetAgentPoll.MySQLDiskUsage + + copyFailed := false + if _, commandCompleted, _ := seedCommandCompleted(targetHostname, seedId); commandCompleted { + copyComplete = true + if _, commandSucceeded, _ := seedCommandSucceeded(targetHostname, seedId); !commandSucceeded { + // failed. + copyFailed = true + } + } + if numStaleIterations > 10 { + copyFailed = true + } + if copyFailed { + AbortSeedCommand(sourceHostname, seedId) + AbortSeedCommand(targetHostname, seedId) + Unmount(sourceHostname) + return updateSeedStateEntry(seedStateId, errors.New("10 iterations have passed without progress. 
Bailing out.")) + } + + var copyPct int64 = 0 + if sourceAgent.MountPoint.MySQLDiskUsage > 0 { + copyPct = 100 * bytesCopied / sourceAgent.MountPoint.MySQLDiskUsage + } + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("Copied %d/%d bytes (%d%%)", bytesCopied, sourceAgent.MountPoint.MySQLDiskUsage, copyPct), "") + + if !copyComplete { + time.Sleep(30 * time.Second) + } + } + + // Cleanup: + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("Executing post-copy command on %s", targetHostname), "") + _, err = PostCopy(targetHostname, sourceHostname) + if err != nil { + return updateSeedStateEntry(seedStateId, err) + } + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("%s Unmounting logical volume: %s", sourceHostname, seedFromLogicalVolume.Path), "") + _, err = Unmount(sourceHostname) + if err != nil { + return updateSeedStateEntry(seedStateId, err) + } + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("Starting MySQL on target: %s", targetHostname), "") + _, err = MySQLStart(targetHostname) + if err != nil { + return updateSeedStateEntry(seedStateId, err) + } + + seedStateId, _ = submitSeedStateEntry(seedId, fmt.Sprintf("Submitting MySQL instance for discovery: %s", targetHostname), "") + SeededAgents <- &targetAgent + + seedStateId, _ = submitSeedStateEntry(seedId, "Done", "") + + return nil +} + +// Seed is the entry point for making a seed +func Seed(targetHostname string, sourceHostname string) (int64, error) { + if targetHostname == sourceHostname { + return 0, log.Errorf("Cannot seed %s onto itself", targetHostname) + } + seedId, err := SubmitSeedEntry(targetHostname, sourceHostname) + if err != nil { + return 0, log.Errore(err) + } + + go func() { + err := executeSeed(seedId, targetHostname, sourceHostname) + updateSeedComplete(seedId, err) + }() + + return seedId, nil +} + +// readSeeds reads seed from the backend table +func readSeeds(whereCondition string, args []interface{}, limit string) ([]SeedOperation, error) { + res := []SeedOperation{} + query := fmt.Sprintf(` + select + agent_seed_id, + target_hostname, + source_hostname, + start_timestamp, + end_timestamp, + is_complete, + is_successful + from + agent_seed + %s + order by + agent_seed_id desc + %s + `, whereCondition, limit) + err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + seedOperation := SeedOperation{} + seedOperation.SeedId = m.GetInt64("agent_seed_id") + seedOperation.TargetHostname = m.GetString("target_hostname") + seedOperation.SourceHostname = m.GetString("source_hostname") + seedOperation.StartTimestamp = m.GetString("start_timestamp") + seedOperation.EndTimestamp = m.GetString("end_timestamp") + seedOperation.IsComplete = m.GetBool("is_complete") + seedOperation.IsSuccessful = m.GetBool("is_successful") + + res = append(res, seedOperation) + return nil + }) + + if err != nil { + log.Errore(err) + } + return res, err +} + +// ReadActiveSeedsForHost reads active seeds where host participates either as source or target +func ReadActiveSeedsForHost(hostname string) ([]SeedOperation, error) { + whereCondition := ` + where + is_complete = 0 + and ( + target_hostname = ? + or source_hostname = ? + ) + ` + return readSeeds(whereCondition, sqlutils.Args(hostname, hostname), "") +} + +// ReadRecentCompletedSeedsForHost reads active seeds where host participates either as source or target +func ReadRecentCompletedSeedsForHost(hostname string) ([]SeedOperation, error) { + whereCondition := ` + where + is_complete = 1 + and ( + target_hostname = ? 
+ or source_hostname = ? + ) + ` + return readSeeds(whereCondition, sqlutils.Args(hostname, hostname), "limit 10") +} + +// AgentSeedDetails reads details from backend table +func AgentSeedDetails(seedId int64) ([]SeedOperation, error) { + whereCondition := ` + where + agent_seed_id = ? + ` + return readSeeds(whereCondition, sqlutils.Args(seedId), "") +} + +// ReadRecentSeeds reads seeds from backend table. +func ReadRecentSeeds() ([]SeedOperation, error) { + return readSeeds(``, sqlutils.Args(), "limit 100") +} + +// SeedOperationState reads states for a given seed operation +func ReadSeedStates(seedId int64) ([]SeedOperationState, error) { + res := []SeedOperationState{} + query := ` + select + agent_seed_state_id, + agent_seed_id, + state_timestamp, + state_action, + error_message + from + agent_seed_state + where + agent_seed_id = ? + order by + agent_seed_state_id desc + ` + err := db.QueryOrchestrator(query, sqlutils.Args(seedId), func(m sqlutils.RowMap) error { + seedState := SeedOperationState{} + seedState.SeedStateId = m.GetInt64("agent_seed_state_id") + seedState.SeedId = m.GetInt64("agent_seed_id") + seedState.StateTimestamp = m.GetString("state_timestamp") + seedState.Action = m.GetString("state_action") + seedState.ErrorMessage = m.GetString("error_message") + + res = append(res, seedState) + return nil + }) + + if err != nil { + log.Errore(err) + } + return res, err +} + +func RelaylogContentsTail(hostname string, startCoordinates *inst.BinlogCoordinates, onResponse *func([]byte)) (Agent, error) { + return executeAgentCommand(hostname, fmt.Sprintf("mysql-relaylog-contents-tail/%s/%d", startCoordinates.LogFile, startCoordinates.LogPos), onResponse) +} + +func ApplyRelaylogContents(hostname string, content string) (Agent, error) { + return executeAgentPostCommand(hostname, "apply-relaylog-contents", content, nil) +} diff --git a/go/vt/orchestrator/agent/instance_topology_agent.go b/go/vt/orchestrator/agent/instance_topology_agent.go new file mode 100644 index 0000000000..7a2739999c --- /dev/null +++ b/go/vt/orchestrator/agent/instance_topology_agent.go @@ -0,0 +1,78 @@ +/* + Copyright 2017 GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package agent + +import ( + "encoding/json" + "fmt" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/inst" +) + +func SyncReplicaRelayLogs(instance, otherInstance *inst.Instance) (*inst.Instance, error) { + var err error + var found bool + var nextCoordinates *inst.BinlogCoordinates + var content string + onResponse := func(contentBytes []byte) { + json.Unmarshal(contentBytes, &content) + } + log.Debugf("SyncReplicaRelayLogs: stopping replication") + + if !instance.ReplicationThreadsStopped() { + return instance, log.Errorf("SyncReplicaRelayLogs: replication on %+v must not run", instance.Key) + } + if !otherInstance.ReplicationThreadsStopped() { + return instance, log.Errorf("SyncReplicaRelayLogs: replication on %+v must not run", otherInstance.Key) + } + + log.Debugf("SyncReplicaRelayLogs: correlating coordinates of %+v on %+v", instance.Key, otherInstance.Key) + _, _, nextCoordinates, found, err = inst.CorrelateRelaylogCoordinates(instance, nil, otherInstance) + if err != nil { + goto Cleanup + } + if !found { + goto Cleanup + } + log.Debugf("SyncReplicaRelayLogs: correlated next-coordinates are %+v", *nextCoordinates) + + InitHttpClient() + if _, err := RelaylogContentsTail(otherInstance.Key.Hostname, nextCoordinates, &onResponse); err != nil { + goto Cleanup + } + log.Debugf("SyncReplicaRelayLogs: got content (%d bytes)", len(content)) + + if _, err := ApplyRelaylogContents(instance.Key.Hostname, content); err != nil { + goto Cleanup + } + log.Debugf("SyncReplicaRelayLogs: applied content (%d bytes)", len(content)) + + instance, err = inst.ChangeMasterTo(&instance.Key, &otherInstance.MasterKey, &otherInstance.ExecBinlogCoordinates, false, inst.GTIDHintNeutral) + if err != nil { + goto Cleanup + } + +Cleanup: + if err != nil { + return instance, log.Errore(err) + } + // and we're done (pending deferred functions) + inst.AuditOperation("align-via-relaylogs", &instance.Key, fmt.Sprintf("aligned %+v by relaylogs from %+v", instance.Key, otherInstance.Key)) + + return instance, err +} diff --git a/go/vt/orchestrator/app/cli.go b/go/vt/orchestrator/app/cli.go new file mode 100644 index 0000000000..7afa2f13b6 --- /dev/null +++ b/go/vt/orchestrator/app/cli.go @@ -0,0 +1,1795 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package app + +import ( + "fmt" + "net" + "os" + "os/user" + "regexp" + "sort" + "strings" + "time" + + "vitess.io/vitess/go/vt/orchestrator/agent" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/util" + "vitess.io/vitess/go/vt/orchestrator/inst" + "vitess.io/vitess/go/vt/orchestrator/kv" + "vitess.io/vitess/go/vt/orchestrator/logic" + "vitess.io/vitess/go/vt/orchestrator/process" +) + +var thisInstanceKey *inst.InstanceKey +var knownCommands []CliCommand + +type CliCommand struct { + Command string + Section string + Description string +} + +type stringSlice []string + +func (a stringSlice) Len() int { return len(a) } +func (a stringSlice) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a stringSlice) Less(i, j int) bool { return a[i] < a[j] } + +var commandSynonyms = map[string]string{ + "stop-slave": "stop-replica", + "start-slave": "start-replica", + "restart-slave": "restart-replica", + "reset-slave": "reset-replica", + "restart-slave-statements": "restart-replica-statements", + "relocate-slaves": "relocate-replicas", + "regroup-slaves": "regroup-replicas", + "move-up-slaves": "move-up-replicas", + "repoint-slaves": "repoint-replicas", + "enslave-siblings": "take-siblings", + "enslave-master": "take-master", + "get-candidate-slave": "get-candidate-replica", + "move-slaves-gtid": "move-replicas-gtid", + "regroup-slaves-gtid": "regroup-replicas-gtid", + "match-slaves": "match-replicas", + "match-up-slaves": "match-up-replicas", + "regroup-slaves-pgtid": "regroup-replicas-pgtid", + "which-cluster-osc-slaves": "which-cluster-osc-replicas", + "which-cluster-gh-ost-slaves": "which-cluster-gh-ost-replicas", + "which-slaves": "which-replicas", + "detach-slave": "detach-replica-master-host", + "detach-replica": "detach-replica-master-host", + "detach-slave-master-host": "detach-replica-master-host", + "reattach-slave": "reattach-replica-master-host", + "reattach-replica": "reattach-replica-master-host", + "reattach-slave-master-host": "reattach-replica-master-host", +} + +func registerCliCommand(command string, section string, description string) string { + if synonym, ok := commandSynonyms[command]; ok { + command = synonym + } + knownCommands = append(knownCommands, CliCommand{Command: command, Section: section, Description: description}) + + return command +} + +func commandsListing() string { + listing := []string{} + lastSection := "" + for _, cliCommand := range knownCommands { + if lastSection != cliCommand.Section { + lastSection = cliCommand.Section + listing = append(listing, fmt.Sprintf("%s:", cliCommand.Section)) + } + commandListing := fmt.Sprintf("\t%-40s%s", cliCommand.Command, cliCommand.Description) + listing = append(listing, commandListing) + } + return strings.Join(listing, "\n") +} + +func availableCommandsUsage() string { + return fmt.Sprintf(`Available commands (-c): +%+v +Run 'orchestrator help ' for detailed help on given command, e.g. 'orchestrator help relocate' + +Usage for most commands: + orchestrator -c [-i [,]* ] [-d ] [--verbose|--debug] +`, commandsListing()) +} + +// getClusterName will make a best effort to deduce a cluster name using either a given alias +// or an instanceKey. First attempt is at alias, and if that doesn't work, we try instanceKey. 
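+// Note (descriptive comment, accurate to the code below): any error returned by
+// inst.FigureClusterName is discarded here, so callers receive whatever cluster
+// name (possibly empty) could be deduced from the alias, the instance key, or
+// this instance's own key.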
+func getClusterName(clusterAlias string, instanceKey *inst.InstanceKey) (clusterName string) { + clusterName, _ = inst.FigureClusterName(clusterAlias, instanceKey, thisInstanceKey) + return clusterName +} + +func assignThisInstanceKey() *inst.InstanceKey { + log.Debugf("Assuming instance is this machine, %+v", thisInstanceKey) + return thisInstanceKey +} + +func validateInstanceIsFound(instanceKey *inst.InstanceKey) (instance *inst.Instance) { + instance, _, err := inst.ReadInstance(instanceKey) + if err != nil { + log.Fatale(err) + } + if instance == nil { + log.Fatalf("Instance not found: %+v", *instanceKey) + } + return instance +} + +// CliWrapper is called from main and allows for the instance parameter +// to take multiple instance names separated by a comma or whitespace. +func CliWrapper(command string, strict bool, instances string, destination string, owner string, reason string, duration string, pattern string, clusterAlias string, pool string, hostnameFlag string) { + if config.Config.RaftEnabled && !*config.RuntimeCLIFlags.IgnoreRaftSetup { + log.Fatalf(`Orchestrator configured to run raft ("RaftEnabled": true). All access must go through the web API of the active raft node. You may use the orchestrator-client script which has a similar interface to the command line invocation. You may override this with --ignore-raft-setup`) + } + r := regexp.MustCompile(`[ ,\r\n\t]+`) + tokens := r.Split(instances, -1) + switch command { + case "submit-pool-instances": + { + // These commands unsplit the tokens (they expect a comma delimited list of instances) + tokens = []string{instances} + } + } + for _, instance := range tokens { + if instance != "" || len(tokens) == 1 { + Cli(command, strict, instance, destination, owner, reason, duration, pattern, clusterAlias, pool, hostnameFlag) + } + } +} + +// Cli initiates a command line interface, executing requested command. 
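+// Typical invocation, matching the -c/-i/-d usage shown in availableCommandsUsage
+// above (hostnames and ports below are illustrative only, not taken from this patch):
+//
+//	orchestrator -c relocate -i replica.example.com:3306 -d target.example.com:3306
+//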
+func Cli(command string, strict bool, instance string, destination string, owner string, reason string, duration string, pattern string, clusterAlias string, pool string, hostnameFlag string) { + if synonym, ok := commandSynonyms[command]; ok { + command = synonym + } + + skipDatabaseCommands := false + switch command { + case "redeploy-internal-db": + skipDatabaseCommands = true + case "help": + skipDatabaseCommands = true + case "dump-config": + skipDatabaseCommands = true + } + + instanceKey, err := inst.ParseResolveInstanceKey(instance) + if err != nil { + instanceKey = nil + } + + rawInstanceKey, err := inst.ParseRawInstanceKey(instance) + if err != nil { + rawInstanceKey = nil + } + + if destination != "" && !strings.Contains(destination, ":") { + destination = fmt.Sprintf("%s:%d", destination, config.Config.DefaultInstancePort) + } + destinationKey, err := inst.ParseResolveInstanceKey(destination) + if err != nil { + destinationKey = nil + } + if !skipDatabaseCommands { + destinationKey = inst.ReadFuzzyInstanceKeyIfPossible(destinationKey) + } + if hostname, err := os.Hostname(); err == nil { + thisInstanceKey = &inst.InstanceKey{Hostname: hostname, Port: int(config.Config.DefaultInstancePort)} + } + postponedFunctionsContainer := inst.NewPostponedFunctionsContainer() + + if len(owner) == 0 { + // get os username as owner + usr, err := user.Current() + if err != nil { + log.Fatale(err) + } + owner = usr.Username + } + inst.SetMaintenanceOwner(owner) + + if !skipDatabaseCommands && !*config.RuntimeCLIFlags.SkipContinuousRegistration { + process.ContinuousRegistration(string(process.OrchestratorExecutionCliMode), command) + } + kv.InitKVStores() + + // begin commands + switch command { + // smart mode + case registerCliCommand("relocate", "Smart relocation", `Relocate a replica beneath another instance`), registerCliCommand("relocate-below", "Smart relocation", `Synonym to 'relocate', will be deprecated`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if destinationKey == nil { + log.Fatal("Cannot deduce destination:", destination) + } + _, err := inst.RelocateBelow(instanceKey, destinationKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) + } + case registerCliCommand("relocate-replicas", "Smart relocation", `Relocates all or part of the replicas of a given instance under another instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if destinationKey == nil { + log.Fatal("Cannot deduce destination:", destination) + } + replicas, _, err, errs := inst.RelocateReplicas(instanceKey, destinationKey, pattern) + if err != nil { + log.Fatale(err) + } else { + for _, e := range errs { + log.Errore(e) + } + for _, replica := range replicas { + fmt.Println(replica.Key.DisplayString()) + } + } + } + case registerCliCommand("take-siblings", "Smart relocation", `Turn all siblings of a replica into its sub-replicas.`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + _, _, err := inst.TakeSiblings(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("regroup-replicas", "Smart relocation", `Given an instance, pick one of its replicas and make it local master of its siblings`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if 
instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + validateInstanceIsFound(instanceKey) + + lostReplicas, equalReplicas, aheadReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicas(instanceKey, false, func(candidateReplica *inst.Instance) { fmt.Println(candidateReplica.Key.DisplayString()) }, postponedFunctionsContainer) + lostReplicas = append(lostReplicas, cannotReplicateReplicas...) + + postponedFunctionsContainer.Wait() + if promotedReplica == nil { + log.Fatalf("Could not regroup replicas of %+v; error: %+v", *instanceKey, err) + } + fmt.Println(fmt.Sprintf("%s lost: %d, trivial: %d, pseudo-gtid: %d", + promotedReplica.Key.DisplayString(), len(lostReplicas), len(equalReplicas), len(aheadReplicas))) + if err != nil { + log.Fatale(err) + } + } + // General replication commands + // move, binlog file:pos + case registerCliCommand("move-up", "Classic file:pos relocation", `Move a replica one level up the topology`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + instance, err := inst.MoveUp(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) + } + case registerCliCommand("move-up-replicas", "Classic file:pos relocation", `Moves replicas of the given instance one level up the topology`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + + movedReplicas, _, err, errs := inst.MoveUpReplicas(instanceKey, pattern) + if err != nil { + log.Fatale(err) + } else { + for _, e := range errs { + log.Errore(e) + } + for _, replica := range movedReplicas { + fmt.Println(replica.Key.DisplayString()) + } + } + } + case registerCliCommand("move-below", "Classic file:pos relocation", `Moves a replica beneath its sibling. Both replicas must be actively replicating from same master.`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if destinationKey == nil { + log.Fatal("Cannot deduce destination/sibling:", destination) + } + _, err := inst.MoveBelow(instanceKey, destinationKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) + } + case registerCliCommand("move-equivalent", "Classic file:pos relocation", `Moves a replica beneath another server, based on previously recorded "equivalence coordinates"`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if destinationKey == nil { + log.Fatal("Cannot deduce destination:", destination) + } + _, err := inst.MoveEquivalent(instanceKey, destinationKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) + } + case registerCliCommand("repoint", "Classic file:pos relocation", `Make the given instance replicate from another instance without changing the binglog coordinates. 
Use with care`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + // destinationKey can be null, in which case the instance repoints to its existing master + instance, err := inst.Repoint(instanceKey, destinationKey, inst.GTIDHintNeutral) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) + } + case registerCliCommand("repoint-replicas", "Classic file:pos relocation", `Repoint all replicas of given instance to replicate back from the instance. Use with care`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + repointedReplicas, err, errs := inst.RepointReplicasTo(instanceKey, pattern, destinationKey) + if err != nil { + log.Fatale(err) + } else { + for _, e := range errs { + log.Errore(e) + } + for _, replica := range repointedReplicas { + fmt.Println(fmt.Sprintf("%s<%s", replica.Key.DisplayString(), instanceKey.DisplayString())) + } + } + } + case registerCliCommand("take-master", "Classic file:pos relocation", `Turn an instance into a master of its own master; essentially switch the two.`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + _, err := inst.TakeMaster(instanceKey, false) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("make-co-master", "Classic file:pos relocation", `Create a master-master replication. Given instance is a replica which replicates directly from a master.`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.MakeCoMaster(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("get-candidate-replica", "Classic file:pos relocation", `Information command suggesting the most up-to-date replica of a given instance that is good for promotion`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + + instance, _, _, _, _, err := inst.GetCandidateReplica(instanceKey, false) + if err != nil { + log.Fatale(err) + } else { + fmt.Println(instance.Key.DisplayString()) + } + } + case registerCliCommand("regroup-replicas-bls", "Binlog server relocation", `Regroup Binlog Server replicas of a given instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + validateInstanceIsFound(instanceKey) + + _, promotedBinlogServer, err := inst.RegroupReplicasBinlogServers(instanceKey, false) + if promotedBinlogServer == nil { + log.Fatalf("Could not regroup binlog server replicas of %+v; error: %+v", *instanceKey, err) + } + fmt.Println(promotedBinlogServer.Key.DisplayString()) + if err != nil { + log.Fatale(err) + } + } + // move, GTID + case registerCliCommand("move-gtid", "GTID relocation", `Move a replica beneath another instance.`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if destinationKey == nil { + log.Fatal("Cannot deduce destination:", destination) + } + _, err := inst.MoveBelowGTID(instanceKey, destinationKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) + } + case registerCliCommand("move-replicas-gtid", "GTID 
relocation", `Moves all replicas of a given instance under another (destination) instance using GTID`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if destinationKey == nil { + log.Fatal("Cannot deduce destination:", destination) + } + movedReplicas, _, err, errs := inst.MoveReplicasGTID(instanceKey, destinationKey, pattern) + if err != nil { + log.Fatale(err) + } else { + for _, e := range errs { + log.Errore(e) + } + for _, replica := range movedReplicas { + fmt.Println(replica.Key.DisplayString()) + } + } + } + case registerCliCommand("regroup-replicas-gtid", "GTID relocation", `Given an instance, pick one of its replica and make it local master of its siblings, using GTID.`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + validateInstanceIsFound(instanceKey) + + lostReplicas, movedReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasGTID(instanceKey, false, func(candidateReplica *inst.Instance) { fmt.Println(candidateReplica.Key.DisplayString()) }, postponedFunctionsContainer, nil) + lostReplicas = append(lostReplicas, cannotReplicateReplicas...) + + if promotedReplica == nil { + log.Fatalf("Could not regroup replicas of %+v; error: %+v", *instanceKey, err) + } + fmt.Println(fmt.Sprintf("%s lost: %d, moved: %d", + promotedReplica.Key.DisplayString(), len(lostReplicas), len(movedReplicas))) + if err != nil { + log.Fatale(err) + } + } + // Pseudo-GTID + case registerCliCommand("match", "Pseudo-GTID relocation", `Matches a replica beneath another (destination) instance using Pseudo-GTID`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if destinationKey == nil { + log.Fatal("Cannot deduce destination:", destination) + } + _, _, err := inst.MatchBelow(instanceKey, destinationKey, true) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), destinationKey.DisplayString())) + } + case registerCliCommand("match-up", "Pseudo-GTID relocation", `Transport the replica one level up the hierarchy, making it child of its grandparent, using Pseudo-GTID`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + instance, _, err := inst.MatchUp(instanceKey, true) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) + } + case registerCliCommand("rematch", "Pseudo-GTID relocation", `Reconnect a replica onto its master, via PSeudo-GTID.`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + instance, _, err := inst.RematchReplica(instanceKey, true) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%s<%s", instanceKey.DisplayString(), instance.MasterKey.DisplayString())) + } + case registerCliCommand("match-replicas", "Pseudo-GTID relocation", `Matches all replicas of a given instance under another (destination) instance using Pseudo-GTID`): + { + // Move all replicas of "instance" beneath "destination" + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + if destinationKey == nil { + log.Fatal("Cannot deduce destination:", destination) + } + + matchedReplicas, _, err, errs := inst.MultiMatchReplicas(instanceKey, destinationKey, pattern) + if err != nil { + log.Fatale(err) + } else { + for _, e := range errs { + 
log.Errore(e) + } + for _, replica := range matchedReplicas { + fmt.Println(replica.Key.DisplayString()) + } + } + } + case registerCliCommand("match-up-replicas", "Pseudo-GTID relocation", `Matches replicas of the given instance one level up the topology, making them siblings of given instance, using Pseudo-GTID`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + + matchedReplicas, _, err, errs := inst.MatchUpReplicas(instanceKey, pattern) + if err != nil { + log.Fatale(err) + } else { + for _, e := range errs { + log.Errore(e) + } + for _, replica := range matchedReplicas { + fmt.Println(replica.Key.DisplayString()) + } + } + } + case registerCliCommand("regroup-replicas-pgtid", "Pseudo-GTID relocation", `Given an instance, pick one of its replica and make it local master of its siblings, using Pseudo-GTID.`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + validateInstanceIsFound(instanceKey) + + onCandidateReplicaChosen := func(candidateReplica *inst.Instance) { fmt.Println(candidateReplica.Key.DisplayString()) } + lostReplicas, equalReplicas, aheadReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasPseudoGTID(instanceKey, false, onCandidateReplicaChosen, postponedFunctionsContainer, nil) + lostReplicas = append(lostReplicas, cannotReplicateReplicas...) + postponedFunctionsContainer.Wait() + if promotedReplica == nil { + log.Fatalf("Could not regroup replicas of %+v; error: %+v", *instanceKey, err) + } + fmt.Println(fmt.Sprintf("%s lost: %d, trivial: %d, pseudo-gtid: %d", + promotedReplica.Key.DisplayString(), len(lostReplicas), len(equalReplicas), len(aheadReplicas))) + if err != nil { + log.Fatale(err) + } + } + // General replication commands + case registerCliCommand("enable-gtid", "Replication, general", `If possible, turn on GTID replication`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.EnableGTID(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("disable-gtid", "Replication, general", `Turn off GTID replication, back to file:pos replication`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.DisableGTID(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("which-gtid-errant", "Replication, general", `Get errant GTID set (empty results if no errant GTID)`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + + instance, err := inst.ReadTopologyInstance(instanceKey) + if err != nil { + log.Fatale(err) + } + if instance == nil { + log.Fatalf("Instance not found: %+v", *instanceKey) + } + fmt.Println(instance.GtidErrant) + } + case registerCliCommand("gtid-errant-reset-master", "Replication, general", `Reset master on instance, remove GTID errant transactions`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.ErrantGTIDResetMaster(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("skip-query", "Replication, general", `Skip a single statement on a replica; either when running with GTID or without`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, 
thisInstanceKey) + _, err := inst.SkipQuery(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("stop-slave", "Replication, general", `Issue a STOP SLAVE on an instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.StopReplication(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("start-slave", "Replication, general", `Issue a START SLAVE on an instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.StartReplication(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("restart-slave", "Replication, general", `STOP and START SLAVE on an instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.RestartReplication(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("reset-slave", "Replication, general", `Issues a RESET SLAVE command; use with care`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.ResetReplicationOperation(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("detach-replica-master-host", "Replication, general", `Stops replication and modifies Master_Host into an impossible, yet reversible, value.`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + _, err := inst.DetachReplicaMasterHost(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("reattach-replica-master-host", "Replication, general", `Undo a detach-replica-master-host operation`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + _, err := inst.ReattachReplicaMasterHost(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("master-pos-wait", "Replication, general", `Wait until replica reaches given replication coordinates (--binlog=file:pos)`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + instance, err := inst.ReadTopologyInstance(instanceKey) + if err != nil { + log.Fatale(err) + } + if instance == nil { + log.Fatalf("Instance not found: %+v", *instanceKey) + } + var binlogCoordinates *inst.BinlogCoordinates + + if binlogCoordinates, err = inst.ParseBinlogCoordinates(*config.RuntimeCLIFlags.BinlogFile); err != nil { + log.Fatalf("Expecing --binlog argument as file:pos") + } + _, err = inst.MasterPosWait(instanceKey, binlogCoordinates) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("enable-semi-sync-master", "Replication, general", `Enable semi-sync replication (master-side)`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.SetSemiSyncMaster(instanceKey, true) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("disable-semi-sync-master", 
"Replication, general", `Disable semi-sync replication (master-side)`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.SetSemiSyncMaster(instanceKey, false) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("enable-semi-sync-replica", "Replication, general", `Enable semi-sync replication (replica-side)`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.SetSemiSyncReplica(instanceKey, true) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("disable-semi-sync-replica", "Replication, general", `Disable semi-sync replication (replica-side)`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.SetSemiSyncReplica(instanceKey, false) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("restart-slave-statements", "Replication, general", `Get a list of statements to execute to stop then restore replica to same execution state. Provide --statement for injected statement`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + statements, err := inst.GetReplicationRestartPreserveStatements(instanceKey, *config.RuntimeCLIFlags.Statement) + if err != nil { + log.Fatale(err) + } + for _, statement := range statements { + fmt.Println(statement) + } + } + // Replication, information + case registerCliCommand("can-replicate-from", "Replication information", `Can an instance (-i) replicate from another (-d) according to replication rules? Prints 'true|false'`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + instance := validateInstanceIsFound(instanceKey) + if destinationKey == nil { + log.Fatal("Cannot deduce target instance:", destination) + } + otherInstance := validateInstanceIsFound(destinationKey) + + if canReplicate, _ := instance.CanReplicateFrom(otherInstance); canReplicate { + fmt.Println(destinationKey.DisplayString()) + } + } + case registerCliCommand("is-replicating", "Replication information", `Is an instance (-i) actively replicating right now`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + instance := validateInstanceIsFound(instanceKey) + if instance.ReplicaRunning() { + fmt.Println(instance.Key.DisplayString()) + } + } + case registerCliCommand("is-replication-stopped", "Replication information", `Is an instance (-i) a replica with both replication threads stopped`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + instance := validateInstanceIsFound(instanceKey) + if instance.ReplicationThreadsStopped() { + fmt.Println(instance.Key.DisplayString()) + } + } + // Instance + case registerCliCommand("set-read-only", "Instance", `Turn an instance read-only, via SET GLOBAL read_only := 1`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.SetReadOnly(instanceKey, true) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("set-writeable", "Instance", `Turn an instance writeable, via SET GLOBAL read_only := 
0`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.SetReadOnly(instanceKey, false) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + // Binary log operations + case registerCliCommand("flush-binary-logs", "Binary logs", `Flush binary logs on an instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + var err error + if *config.RuntimeCLIFlags.BinlogFile == "" { + _, err = inst.FlushBinaryLogs(instanceKey, 1) + } else { + _, err = inst.FlushBinaryLogsTo(instanceKey, *config.RuntimeCLIFlags.BinlogFile) + } + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("purge-binary-logs", "Binary logs", `Purge binary logs of an instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + var err error + if *config.RuntimeCLIFlags.BinlogFile == "" { + log.Fatal("expecting --binlog value") + } + + _, err = inst.PurgeBinaryLogsTo(instanceKey, *config.RuntimeCLIFlags.BinlogFile, false) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("last-pseudo-gtid", "Binary logs", `Find latest Pseudo-GTID entry in instance's binary logs`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + instance, err := inst.ReadTopologyInstance(instanceKey) + if err != nil { + log.Fatale(err) + } + if instance == nil { + log.Fatalf("Instance not found: %+v", *instanceKey) + } + coordinates, text, err := inst.FindLastPseudoGTIDEntry(instance, instance.RelaylogCoordinates, nil, strict, nil) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%+v:%s", *coordinates, text)) + } + case registerCliCommand("locate-gtid-errant", "Binary logs", `List binary logs containing errant GTIDs`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + errantBinlogs, err := inst.LocateErrantGTID(instanceKey) + if err != nil { + log.Fatale(err) + } + for _, binlog := range errantBinlogs { + fmt.Println(binlog) + } + } + case registerCliCommand("last-executed-relay-entry", "Binary logs", `Find coordinates of last executed relay log entry`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + instance, err := inst.ReadTopologyInstance(instanceKey) + if err != nil { + log.Fatale(err) + } + if instance == nil { + log.Fatalf("Instance not found: %+v", *instanceKey) + } + minCoordinates, err := inst.GetPreviousKnownRelayLogCoordinatesForInstance(instance) + if err != nil { + log.Fatalf("Error reading last known coordinates for %+v: %+v", instance.Key, err) + } + binlogEvent, err := inst.GetLastExecutedEntryInRelayLogs(instance, minCoordinates, instance.RelaylogCoordinates) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%+v:%d", *binlogEvent, binlogEvent.NextEventPos)) + } + case registerCliCommand("correlate-relaylog-pos", "Binary logs", `Given an instance (-i) and relaylog coordinates (--binlog=file:pos), find the correlated coordinates in another instance's relay logs (-d)`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + instance, err := inst.ReadTopologyInstance(instanceKey) + if err != 
nil { + log.Fatale(err) + } + if instance == nil { + log.Fatalf("Instance not found: %+v", *instanceKey) + } + if destinationKey == nil { + log.Fatal("Cannot deduce target instance:", destination) + } + otherInstance, err := inst.ReadTopologyInstance(destinationKey) + if err != nil { + log.Fatale(err) + } + if otherInstance == nil { + log.Fatalf("Instance not found: %+v", *destinationKey) + } + + var relaylogCoordinates *inst.BinlogCoordinates + if *config.RuntimeCLIFlags.BinlogFile != "" { + if relaylogCoordinates, err = inst.ParseBinlogCoordinates(*config.RuntimeCLIFlags.BinlogFile); err != nil { + log.Fatalf("Expecing --binlog argument as file:pos") + } + } + instanceCoordinates, correlatedCoordinates, nextCoordinates, _, err := inst.CorrelateRelaylogCoordinates(instance, relaylogCoordinates, otherInstance) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%+v;%+v;%+v", *instanceCoordinates, *correlatedCoordinates, *nextCoordinates)) + } + case registerCliCommand("find-binlog-entry", "Binary logs", `Get binlog file:pos of entry given by --pattern (exact full match, not a regular expression) in a given instance`): + { + if pattern == "" { + log.Fatal("No pattern given") + } + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + instance, err := inst.ReadTopologyInstance(instanceKey) + if err != nil { + log.Fatale(err) + } + if instance == nil { + log.Fatalf("Instance not found: %+v", *instanceKey) + } + coordinates, err := inst.SearchEntryInInstanceBinlogs(instance, pattern, false, nil) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%+v", *coordinates)) + } + case registerCliCommand("correlate-binlog-pos", "Binary logs", `Given an instance (-i) and binlog coordinates (--binlog=file:pos), find the correlated coordinates in another instance (-d)`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unresolved instance") + } + instance, err := inst.ReadTopologyInstance(instanceKey) + if err != nil { + log.Fatale(err) + } + if instance == nil { + log.Fatalf("Instance not found: %+v", *instanceKey) + } + if !instance.LogBinEnabled { + log.Fatalf("Instance does not have binary logs: %+v", *instanceKey) + } + if destinationKey == nil { + log.Fatal("Cannot deduce target instance:", destination) + } + otherInstance, err := inst.ReadTopologyInstance(destinationKey) + if err != nil { + log.Fatale(err) + } + if otherInstance == nil { + log.Fatalf("Instance not found: %+v", *destinationKey) + } + var binlogCoordinates *inst.BinlogCoordinates + if *config.RuntimeCLIFlags.BinlogFile == "" { + binlogCoordinates = &instance.SelfBinlogCoordinates + } else { + if binlogCoordinates, err = inst.ParseBinlogCoordinates(*config.RuntimeCLIFlags.BinlogFile); err != nil { + log.Fatalf("Expecing --binlog argument as file:pos") + } + } + + coordinates, _, err := inst.CorrelateBinlogCoordinates(instance, binlogCoordinates, otherInstance) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%+v", *coordinates)) + } + // Pool + case registerCliCommand("submit-pool-instances", "Pools", `Submit a pool name with a list of instances in that pool`): + { + if pool == "" { + log.Fatal("Please submit --pool") + } + err := inst.ApplyPoolInstances(inst.NewPoolInstancesSubmission(pool, instance)) + if err != nil { + log.Fatale(err) + } + } + case registerCliCommand("cluster-pool-instances", "Pools", `List all pools and their 
associated instances`): + { + clusterPoolInstances, err := inst.ReadAllClusterPoolInstances() + if err != nil { + log.Fatale(err) + } + for _, clusterPoolInstance := range clusterPoolInstances { + fmt.Println(fmt.Sprintf("%s\t%s\t%s\t%s:%d", clusterPoolInstance.ClusterName, clusterPoolInstance.ClusterAlias, clusterPoolInstance.Pool, clusterPoolInstance.Hostname, clusterPoolInstance.Port)) + } + } + case registerCliCommand("which-heuristic-cluster-pool-instances", "Pools", `List instances of a given cluster which are in either any pool or in a specific pool`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + + instances, err := inst.GetHeuristicClusterPoolInstances(clusterName, pool) + if err != nil { + log.Fatale(err) + } else { + for _, instance := range instances { + fmt.Println(instance.Key.DisplayString()) + } + } + } + // Information + case registerCliCommand("find", "Information", `Find instances whose hostname matches given regex pattern`): + { + if pattern == "" { + log.Fatal("No pattern given") + } + instances, err := inst.FindInstances(pattern) + if err != nil { + log.Fatale(err) + } else { + for _, instance := range instances { + fmt.Println(instance.Key.DisplayString()) + } + } + } + case registerCliCommand("search", "Information", `Search instances by name, version, version comment, port`): + { + if pattern == "" { + log.Fatal("No pattern given") + } + instances, err := inst.SearchInstances(pattern) + if err != nil { + log.Fatale(err) + } else { + for _, instance := range instances { + fmt.Println(instance.Key.DisplayString()) + } + } + } + case registerCliCommand("clusters", "Information", `List all clusters known to orchestrator`): + { + clusters, err := inst.ReadClusters() + if err != nil { + log.Fatale(err) + } + fmt.Println(strings.Join(clusters, "\n")) + } + case registerCliCommand("clusters-alias", "Information", `List all clusters known to orchestrator`): + { + clusters, err := inst.ReadClustersInfo("") + if err != nil { + log.Fatale(err) + } + for _, cluster := range clusters { + fmt.Println(fmt.Sprintf("%s\t%s", cluster.ClusterName, cluster.ClusterAlias)) + } + } + case registerCliCommand("all-clusters-masters", "Information", `List of writeable masters, one per cluster`): + { + instances, err := inst.ReadWriteableClustersMasters() + if err != nil { + log.Fatale(err) + } else { + for _, instance := range instances { + fmt.Println(instance.Key.DisplayString()) + } + } + } + case registerCliCommand("topology", "Information", `Show an ascii-graph of a replication topology, given a member of that topology`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + output, err := inst.ASCIITopology(clusterName, pattern, false, false) + if err != nil { + log.Fatale(err) + } + fmt.Println(output) + } + case registerCliCommand("topology-tabulated", "Information", `Show an ascii-graph of a replication topology, given a member of that topology`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + output, err := inst.ASCIITopology(clusterName, pattern, true, false) + if err != nil { + log.Fatale(err) + } + fmt.Println(output) + } + case registerCliCommand("topology-tags", "Information", `Show an ascii-graph of a replication topology and instance tags, given a member of that topology`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + output, err := inst.ASCIITopology(clusterName, pattern, false, true) + if err != nil { + log.Fatale(err) + } + fmt.Println(output) + } + case registerCliCommand("all-instances", "Information", 
`The complete list of known instances`): + { + instances, err := inst.SearchInstances("") + if err != nil { + log.Fatale(err) + } else { + for _, instance := range instances { + fmt.Println(instance.Key.DisplayString()) + } + } + } + case registerCliCommand("which-instance", "Information", `Output the fully-qualified hostname:port representation of the given instance, or error if unknown`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unable to get master: unresolved instance") + } + instance := validateInstanceIsFound(instanceKey) + fmt.Println(instance.Key.DisplayString()) + } + case registerCliCommand("which-cluster", "Information", `Output the name of the cluster an instance belongs to, or error if unknown to orchestrator`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + fmt.Println(clusterName) + } + case registerCliCommand("which-cluster-alias", "Information", `Output the alias of the cluster an instance belongs to, or error if unknown to orchestrator`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + clusterInfo, err := inst.ReadClusterInfo(clusterName) + if err != nil { + log.Fatale(err) + } + fmt.Println(clusterInfo.ClusterAlias) + } + case registerCliCommand("which-cluster-domain", "Information", `Output the domain name of the cluster an instance belongs to, or error if unknown to orchestrator`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + clusterInfo, err := inst.ReadClusterInfo(clusterName) + if err != nil { + log.Fatale(err) + } + fmt.Println(clusterInfo.ClusterDomain) + } + case registerCliCommand("which-heuristic-domain-instance", "Information", `Returns the instance associated as the cluster's writer with a cluster's domain name.`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + instanceKey, err := inst.GetHeuristicClusterDomainInstanceAttribute(clusterName) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("which-cluster-master", "Information", `Output the name of the master in a given cluster`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + masters, err := inst.ReadClusterMaster(clusterName) + if err != nil { + log.Fatale(err) + } + if len(masters) == 0 { + log.Fatalf("No writeable masters found for cluster %+v", clusterName) + } + fmt.Println(masters[0].Key.DisplayString()) + } + case registerCliCommand("which-cluster-instances", "Information", `Output the list of instances participating in same cluster as given instance`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + instances, err := inst.ReadClusterInstances(clusterName) + if err != nil { + log.Fatale(err) + } + for _, clusterInstance := range instances { + fmt.Println(clusterInstance.Key.DisplayString()) + } + } + case registerCliCommand("which-cluster-osc-replicas", "Information", `Output a list of replicas in a cluster, that could serve as a pt-online-schema-change operation control replicas`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + instances, err := inst.GetClusterOSCReplicas(clusterName) + if err != nil { + log.Fatale(err) + } + for _, clusterInstance := range instances { + fmt.Println(clusterInstance.Key.DisplayString()) + } + } + case registerCliCommand("which-cluster-gh-ost-replicas", "Information", `Output a list of replicas in a cluster, that could serve as a gh-ost working server`): + { + clusterName := getClusterName(clusterAlias, 
instanceKey) + instances, err := inst.GetClusterGhostReplicas(clusterName) + if err != nil { + log.Fatale(err) + } + for _, clusterInstance := range instances { + fmt.Println(clusterInstance.Key.DisplayString()) + } + } + case registerCliCommand("which-master", "Information", `Output the fully-qualified hostname:port representation of a given instance's master`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unable to get master: unresolved instance") + } + instance := validateInstanceIsFound(instanceKey) + if instance.MasterKey.IsValid() { + fmt.Println(instance.MasterKey.DisplayString()) + } + } + case registerCliCommand("which-downtimed-instances", "Information", `List instances currently downtimed, potentially filtered by cluster`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + instances, err := inst.ReadDowntimedInstances(clusterName) + if err != nil { + log.Fatale(err) + } + for _, clusterInstance := range instances { + fmt.Println(clusterInstance.Key.DisplayString()) + } + } + case registerCliCommand("which-replicas", "Information", `Output the fully-qualified hostname:port list of replicas of a given instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unable to get replicas: unresolved instance") + } + replicas, err := inst.ReadReplicaInstances(instanceKey) + if err != nil { + log.Fatale(err) + } + for _, replica := range replicas { + fmt.Println(replica.Key.DisplayString()) + } + } + case registerCliCommand("which-lost-in-recovery", "Information", `List instances marked as downtimed for being lost in a recovery process`): + { + instances, err := inst.ReadLostInRecoveryInstances("") + if err != nil { + log.Fatale(err) + } + for _, instance := range instances { + fmt.Println(instance.Key.DisplayString()) + } + } + case registerCliCommand("instance-status", "Information", `Output short status on a given instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatalf("Unable to get status: unresolved instance") + } + instance := validateInstanceIsFound(instanceKey) + fmt.Println(instance.HumanReadableDescription()) + } + case registerCliCommand("get-cluster-heuristic-lag", "Information", `For a given cluster (indicated by an instance or alias), output a heuristic "representative" lag of that cluster`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + lag, err := inst.GetClusterHeuristicLag(clusterName) + if err != nil { + log.Fatale(err) + } + fmt.Println(lag) + } + case registerCliCommand("submit-masters-to-kv-stores", "Key-value", `Submit master of a specific cluster, or all masters of all clusters to key-value stores`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + log.Debugf("cluster name is <%s>", clusterName) + + kvPairs, _, err := logic.SubmitMastersToKvStores(clusterName, true) + if err != nil { + log.Fatale(err) + } + for _, kvPair := range kvPairs { + fmt.Println(fmt.Sprintf("%s:%s", kvPair.Key, kvPair.Value)) + } + } + + case registerCliCommand("tags", "tags", `List tags for a given instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + tags, err := inst.ReadInstanceTags(instanceKey) + if err != nil { + log.Fatale(err) + } + for _, tag := range tags { + fmt.Println(tag.String()) + } + } + case registerCliCommand("tag-value", "tags", `Get tag value for a specific instance`): + { + 
instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + tag, err := inst.ParseTag(*config.RuntimeCLIFlags.Tag) + if err != nil { + log.Fatale(err) + } + + tagExists, err := inst.ReadInstanceTag(instanceKey, tag) + if err != nil { + log.Fatale(err) + } + if tagExists { + fmt.Println(tag.TagValue) + } + } + case registerCliCommand("tagged", "tags", `List instances tagged by tag-string. Format: "tagname" or "tagname=tagvalue" or comma separated "tag0,tag1=val1,tag2" for intersection of all.`): + { + tagsString := *config.RuntimeCLIFlags.Tag + instanceKeyMap, err := inst.GetInstanceKeysByTags(tagsString) + if err != nil { + log.Fatale(err) + } + keysDisplayStrings := []string{} + for _, key := range instanceKeyMap.GetInstanceKeys() { + keysDisplayStrings = append(keysDisplayStrings, key.DisplayString()) + } + sort.Strings(keysDisplayStrings) + for _, s := range keysDisplayStrings { + fmt.Println(s) + } + } + case registerCliCommand("tag", "tags", `Add a tag to a given instance. Tag in "tagname" or "tagname=tagvalue" format`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + tag, err := inst.ParseTag(*config.RuntimeCLIFlags.Tag) + if err != nil { + log.Fatale(err) + } + inst.PutInstanceTag(instanceKey, tag) + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("untag", "tags", `Remove a tag from an instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + tag, err := inst.ParseTag(*config.RuntimeCLIFlags.Tag) + if err != nil { + log.Fatale(err) + } + untagged, err := inst.Untag(instanceKey, tag) + if err != nil { + log.Fatale(err) + } + for _, key := range untagged.GetInstanceKeys() { + fmt.Println(key.DisplayString()) + } + } + case registerCliCommand("untag-all", "tags", `Remove a tag from all matching instances`): + { + tag, err := inst.ParseTag(*config.RuntimeCLIFlags.Tag) + if err != nil { + log.Fatale(err) + } + untagged, err := inst.Untag(nil, tag) + if err != nil { + log.Fatale(err) + } + for _, key := range untagged.GetInstanceKeys() { + fmt.Println(key.DisplayString()) + } + } + + // Instance management + case registerCliCommand("discover", "Instance management", `Lookup an instance, investigate it`): + { + if instanceKey == nil { + instanceKey = thisInstanceKey + } + if instanceKey == nil { + log.Fatalf("Cannot figure instance key") + } + instance, err := inst.ReadTopologyInstance(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instance.Key.DisplayString()) + } + case registerCliCommand("forget", "Instance management", `Forget about an instance's existence`): + { + if rawInstanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + instanceKey, _ = inst.FigureInstanceKey(rawInstanceKey, nil) + err := inst.ForgetInstance(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("begin-maintenance", "Instance management", `Request a maintenance lock on an instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if reason == "" { + log.Fatal("--reason option required") + } + var durationSeconds int = 0 + if duration != "" { + durationSeconds, err = util.SimpleTimeToSeconds(duration) + if err != nil { + log.Fatale(err) + } + if durationSeconds < 0 { + log.Fatalf("Duration value must be non-negative. 
Given value: %d", durationSeconds) + } + } + maintenanceKey, err := inst.BeginBoundedMaintenance(instanceKey, inst.GetMaintenanceOwner(), reason, uint(durationSeconds), true) + if err == nil { + log.Infof("Maintenance key: %+v", maintenanceKey) + log.Infof("Maintenance duration: %d seconds", durationSeconds) + } + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("end-maintenance", "Instance management", `Remove maintenance lock from an instance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.EndMaintenanceByInstanceKey(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("in-maintenance", "Instance management", `Check whether instance is under maintenance`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + inMaintenance, err := inst.InMaintenance(instanceKey) + if err != nil { + log.Fatale(err) + } + if inMaintenance { + fmt.Println(instanceKey.DisplayString()) + } + } + case registerCliCommand("begin-downtime", "Instance management", `Mark an instance as downtimed`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if reason == "" { + log.Fatal("--reason option required") + } + var durationSeconds int = 0 + if duration != "" { + durationSeconds, err = util.SimpleTimeToSeconds(duration) + if err != nil { + log.Fatale(err) + } + if durationSeconds < 0 { + log.Fatalf("Duration value must be non-negative. Given value: %d", durationSeconds) + } + } + duration := time.Duration(durationSeconds) * time.Second + err := inst.BeginDowntime(inst.NewDowntime(instanceKey, inst.GetMaintenanceOwner(), reason, duration)) + if err == nil { + log.Infof("Downtime duration: %d seconds", durationSeconds) + } else { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("end-downtime", "Instance management", `Indicate an instance is no longer downtimed`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + _, err := inst.EndDowntime(instanceKey) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + // Recovery & analysis + case registerCliCommand("recover", "Recovery", `Do auto-recovery given a dead instance`), registerCliCommand("recover-lite", "Recovery", `Do auto-recovery given a dead instance. Orchestrator chooses the best course of actionwithout executing external processes`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + if instanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + + recoveryAttempted, promotedInstanceKey, err := logic.CheckAndRecover(instanceKey, destinationKey, (command == "recover-lite")) + if err != nil { + log.Fatale(err) + } + if recoveryAttempted { + if promotedInstanceKey == nil { + log.Fatalf("Recovery attempted yet no replica promoted") + } + fmt.Println(promotedInstanceKey.DisplayString()) + } + } + case registerCliCommand("force-master-failover", "Recovery", `Forcibly discard master and initiate a failover, even if orchestrator doesn't see a problem. 
This command lets orchestrator choose the replacement master`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + topologyRecovery, err := logic.ForceMasterFailover(clusterName) + if err != nil { + log.Fatale(err) + } + fmt.Println(topologyRecovery.SuccessorKey.DisplayString()) + } + case registerCliCommand("force-master-takeover", "Recovery", `Forcibly discard master and promote another (direct child) instance instead, even if everything is running well`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + if destinationKey == nil { + log.Fatal("Cannot deduce destination, the instance to promote in place of the master. Please provide with -d") + } + destination := validateInstanceIsFound(destinationKey) + topologyRecovery, err := logic.ForceMasterTakeover(clusterName, destination) + if err != nil { + log.Fatale(err) + } + fmt.Println(topologyRecovery.SuccessorKey.DisplayString()) + } + case registerCliCommand("graceful-master-takeover", "Recovery", `Gracefully promote a new master. Either indicate identity of new master via '-d designated.instance.com' or setup replication tree to have a single direct replica to the master.`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + if destinationKey != nil { + validateInstanceIsFound(destinationKey) + } + topologyRecovery, promotedMasterCoordinates, err := logic.GracefulMasterTakeover(clusterName, destinationKey, false) + if err != nil { + log.Fatale(err) + } + fmt.Println(topologyRecovery.SuccessorKey.DisplayString()) + fmt.Println(*promotedMasterCoordinates) + log.Debugf("Promoted %+v as new master. Binlog coordinates at time of promotion: %+v", topologyRecovery.SuccessorKey, *promotedMasterCoordinates) + } + case registerCliCommand("graceful-master-takeover-auto", "Recovery", `Gracefully promote a new master. orchestrator will attempt to pick the promoted replica automatically`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + // destinationKey doesn't _have_ to be specified: if unspecified, orchestrator will auto-deduce a replica. + // but if specified, then that's the replica to promote, and it must be valid. + if destinationKey != nil { + validateInstanceIsFound(destinationKey) + } + topologyRecovery, promotedMasterCoordinates, err := logic.GracefulMasterTakeover(clusterName, destinationKey, true) + if err != nil { + log.Fatale(err) + } + fmt.Println(topologyRecovery.SuccessorKey.DisplayString()) + fmt.Println(*promotedMasterCoordinates) + log.Debugf("Promoted %+v as new master. 
Binlog coordinates at time of promotion: %+v", topologyRecovery.SuccessorKey, *promotedMasterCoordinates) + } + case registerCliCommand("replication-analysis", "Recovery", `Request an analysis of potential crash incidents in all known topologies`): + { + analysis, err := inst.GetReplicationAnalysis("", &inst.ReplicationAnalysisHints{}) + if err != nil { + log.Fatale(err) + } + for _, entry := range analysis { + fmt.Println(fmt.Sprintf("%s (cluster %s): %s", entry.AnalyzedInstanceKey.DisplayString(), entry.ClusterDetails.ClusterName, entry.AnalysisString())) + } + } + case registerCliCommand("ack-all-recoveries", "Recovery", `Acknowledge all recoveries; this unblocks pending future recoveries`): + { + if reason == "" { + log.Fatal("--reason option required (comment your ack)") + } + countRecoveries, err := logic.AcknowledgeAllRecoveries(inst.GetMaintenanceOwner(), reason) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%d recoveries acknowledged", countRecoveries)) + } + case registerCliCommand("ack-cluster-recoveries", "Recovery", `Acknowledge recoveries for a given cluster; this unblocks pending future recoveries`): + { + if reason == "" { + log.Fatal("--reason option required (comment your ack)") + } + clusterName := getClusterName(clusterAlias, instanceKey) + countRecoveries, err := logic.AcknowledgeClusterRecoveries(clusterName, inst.GetMaintenanceOwner(), reason) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%d recoveries acknowledged", countRecoveries)) + } + case registerCliCommand("ack-instance-recoveries", "Recovery", `Acknowledge recoveries for a given instance; this unblocks pending future recoveries`): + { + if reason == "" { + log.Fatal("--reason option required (comment your ack)") + } + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + + countRecoveries, err := logic.AcknowledgeInstanceRecoveries(instanceKey, inst.GetMaintenanceOwner(), reason) + if err != nil { + log.Fatale(err) + } + fmt.Println(fmt.Sprintf("%d recoveries acknowledged", countRecoveries)) + } + // Instance meta + case registerCliCommand("register-candidate", "Instance, meta", `Indicate that a specific instance is a preferred candidate for master promotion`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + promotionRule, err := inst.ParseCandidatePromotionRule(*config.RuntimeCLIFlags.PromotionRule) + if err != nil { + log.Fatale(err) + } + err = inst.RegisterCandidateInstance(inst.NewCandidateDatabaseInstance(instanceKey, promotionRule).WithCurrentTime()) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("register-hostname-unresolve", "Instance, meta", `Assigns the given instance a virtual (aka "unresolved") name`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + err := inst.RegisterHostnameUnresolve(inst.NewHostnameRegistration(instanceKey, hostnameFlag)) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("deregister-hostname-unresolve", "Instance, meta", `Explicitly deregister/disassociate a hostname with an "unresolved" name`): + { + instanceKey, _ = inst.FigureInstanceKey(instanceKey, thisInstanceKey) + err := inst.RegisterHostnameUnresolve(inst.NewHostnameDeregistration(instanceKey)) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + case registerCliCommand("set-heuristic-domain-instance", "Instance, meta", `Associate domain 
name of given cluster with what seems to be the writer master for that cluster`): + { + clusterName := getClusterName(clusterAlias, instanceKey) + instanceKey, err := inst.HeuristicallyApplyClusterDomainInstanceAttribute(clusterName) + if err != nil { + log.Fatale(err) + } + fmt.Println(instanceKey.DisplayString()) + } + + // meta + case registerCliCommand("snapshot-topologies", "Meta", `Take a snapshot of existing topologies.`): + { + err := inst.SnapshotTopologies() + if err != nil { + log.Fatale(err) + } + } + case registerCliCommand("continuous", "Meta", `Enter continuous mode, and actively poll for instances, diagnose problems, do maintenance`): + { + logic.ContinuousDiscovery() + } + case registerCliCommand("active-nodes", "Meta", `List currently active orchestrator nodes`): + { + nodes, err := process.ReadAvailableNodes(false) + if err != nil { + log.Fatale(err) + } + for _, node := range nodes { + fmt.Println(node) + } + } + case registerCliCommand("access-token", "Meta", `Get a HTTP access token`): + { + publicToken, err := process.GenerateAccessToken(owner) + if err != nil { + log.Fatale(err) + } + fmt.Println(publicToken) + } + case registerCliCommand("resolve", "Meta", `Resolve given hostname`): + { + if rawInstanceKey == nil { + log.Fatal("Cannot deduce instance:", instance) + } + if conn, err := net.Dial("tcp", rawInstanceKey.DisplayString()); err == nil { + log.Debugf("tcp test is good; got connection %+v", conn) + conn.Close() + } else { + log.Fatale(err) + } + if cname, err := inst.GetCNAME(rawInstanceKey.Hostname); err == nil { + log.Debugf("GetCNAME() %+v, %+v", cname, err) + rawInstanceKey.Hostname = cname + fmt.Println(rawInstanceKey.DisplayString()) + } else { + log.Fatale(err) + } + } + case registerCliCommand("reset-hostname-resolve-cache", "Meta", `Clear the hostname resolve cache`): + { + err := inst.ResetHostnameResolveCache() + if err != nil { + log.Fatale(err) + } + fmt.Println("hostname resolve cache cleared") + } + case registerCliCommand("dump-config", "Meta", `Print out configuration in JSON format`): + { + jsonString := config.Config.ToJSONString() + fmt.Println(jsonString) + } + case registerCliCommand("show-resolve-hosts", "Meta", `Show the content of the hostname_resolve table. Generally used for debugging`): + { + resolves, err := inst.ReadAllHostnameResolves() + if err != nil { + log.Fatale(err) + } + for _, r := range resolves { + fmt.Println(r) + } + } + case registerCliCommand("show-unresolve-hosts", "Meta", `Show the content of the hostname_unresolve table. 
Generally used for debugging`): + { + unresolves, err := inst.ReadAllHostnameUnresolves() + if err != nil { + log.Fatale(err) + } + for _, r := range unresolves { + fmt.Println(r) + } + } + case registerCliCommand("redeploy-internal-db", "Meta, internal", `Force internal schema migration to current backend structure`): + { + config.RuntimeCLIFlags.ConfiguredVersion = "" + _, err := inst.ReadClusters() + if err != nil { + log.Fatale(err) + } + fmt.Println("Redeployed internal db") + } + case registerCliCommand("internal-suggest-promoted-replacement", "Internal", `Internal only, used to test promotion logic in CI`): + { + destination := validateInstanceIsFound(destinationKey) + replacement, _, err := logic.SuggestReplacementForPromotedReplica(&logic.TopologyRecovery{}, instanceKey, destination, nil) + if err != nil { + log.Fatale(err) + } + fmt.Println(replacement.Key.DisplayString()) + } + case registerCliCommand("custom-command", "Agent", "Execute a custom command on the agent as defined in the agent conf"): + { + output, err := agent.CustomCommand(hostnameFlag, pattern) + if err != nil { + log.Fatale(err) + } + + fmt.Printf("%v\n", output) + } + case registerCliCommand("disable-global-recoveries", "", `Disallow orchestrator from performing recoveries globally`): + { + if err := logic.DisableRecovery(); err != nil { + log.Fatalf("ERROR: Failed to disable recoveries globally: %v\n", err) + } + fmt.Println("OK: Orchestrator recoveries DISABLED globally") + } + case registerCliCommand("enable-global-recoveries", "", `Allow orchestrator to perform recoveries globally`): + { + if err := logic.EnableRecovery(); err != nil { + log.Fatalf("ERROR: Failed to enable recoveries globally: %v\n", err) + } + fmt.Println("OK: Orchestrator recoveries ENABLED globally") + } + case registerCliCommand("check-global-recoveries", "", `Show the global recovery configuration`): + { + isDisabled, err := logic.IsRecoveryDisabled() + if err != nil { + log.Fatalf("ERROR: Failed to determine if recoveries are disabled globally: %v\n", err) + } + fmt.Printf("OK: Global recoveries disabled: %v\n", isDisabled) + } + case registerCliCommand("bulk-instances", "", `Return a list of sorted instance names known to orchestrator`): + { + instances, err := inst.BulkReadInstance() + if err != nil { + log.Fatalf("Error: Failed to retrieve instances: %v\n", err) + return + } + var asciiInstances stringSlice + for _, v := range instances { + asciiInstances = append(asciiInstances, v.String()) + } + sort.Sort(asciiInstances) + fmt.Printf("%s\n", strings.Join(asciiInstances, "\n")) + } + case registerCliCommand("bulk-promotion-rules", "", `Return a list of promotion rules known to orchestrator`): + { + promotionRules, err := inst.BulkReadCandidateDatabaseInstance() + if err != nil { + log.Fatalf("Error: Failed to retrieve promotion rules: %v\n", err) + } + var asciiPromotionRules stringSlice + for _, v := range promotionRules { + asciiPromotionRules = append(asciiPromotionRules, v.String()) + } + sort.Sort(asciiPromotionRules) + + fmt.Printf("%s\n", strings.Join(asciiPromotionRules, "\n")) + } + // Help + case "help": + { + fmt.Fprintf(os.Stderr, availableCommandsUsage()) + } + default: + log.Fatalf("Unknown command: \"%s\". 
%s", command, availableCommandsUsage()) + } +} diff --git a/go/vt/orchestrator/app/cli_test.go b/go/vt/orchestrator/app/cli_test.go new file mode 100644 index 0000000000..9461d92b43 --- /dev/null +++ b/go/vt/orchestrator/app/cli_test.go @@ -0,0 +1,37 @@ +package app + +import ( + "testing" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func init() { + config.Config.HostnameResolveMethod = "none" + config.MarkConfigurationLoaded() + log.SetLevel(log.ERROR) +} + +func TestHelp(t *testing.T) { + Cli("help", false, "localhost:9999", "localhost:9999", "orc", "no-reason", "1m", ".", "no-alias", "no-pool", "") + test.S(t).ExpectTrue(len(knownCommands) > 0) +} + +func TestKnownCommands(t *testing.T) { + Cli("help", false, "localhost:9999", "localhost:9999", "orc", "no-reason", "1m", ".", "no-alias", "no-pool", "") + + commandsMap := make(map[string]string) + for _, command := range knownCommands { + commandsMap[command.Command] = command.Section + } + test.S(t).ExpectEquals(commandsMap["no-such-command"], "") + test.S(t).ExpectEquals(commandsMap["relocate"], "Smart relocation") + test.S(t).ExpectEquals(commandsMap["relocate-slaves"], "") + test.S(t).ExpectEquals(commandsMap["relocate-replicas"], "Smart relocation") + + for _, synonym := range commandSynonyms { + test.S(t).ExpectNotEquals(commandsMap[synonym], "") + } +} diff --git a/go/vt/orchestrator/app/command_help.go b/go/vt/orchestrator/app/command_help.go new file mode 100644 index 0000000000..2c04a0ceda --- /dev/null +++ b/go/vt/orchestrator/app/command_help.go @@ -0,0 +1,907 @@ +/* + Copyright 2016 GitHub Inc. + See https://github.com/openark/orchestrator/blob/master/LICENSE +*/ + +package app + +import ( + "fmt" + "strings" +) + +const AppPrompt string = ` +orchestrator [-c command] [-i instance] [-d destination] [--verbose|--debug] [... cli ] | http + +Cheatsheet: + Run orchestrator in HTTP mode: + + orchestrator --debug http + + See all possible commands: + + orchestrator help + + Detailed help for a given command (e.g. "relocate") + + orchestrator help relocate +` + +var CommandHelp map[string]string + +func init() { + CommandHelp = make(map[string]string) + CommandHelp["relocate"] = ` + Relocate a replica beneath another (destination) instance. The choice of destination is almost arbitrary; + it must not be a child/descendant of the instance, but otherwise it can be anywhere, and can be a normal replica + or a binlog server. Orchestrator will choose the best course of action to relocate the replica. + No action taken when destination instance cannot act as master (e.g. has no binary logs, is of incompatible version, incompatible binlog format etc.) + Examples: + + orchestrator -c relocate -i replica.to.relocate.com -d instance.that.becomes.its.master + + orchestrator -c relocate -d destination.instance.that.becomes.its.master + -i not given, implicitly assumed local hostname + + (this command was previously named "relocate-below") + ` + CommandHelp["relocate-replicas"] = ` + Relocates all or part of the replicas of a given instance under another (destination) instance. This is + typically much faster than relocating replicas one by one. + Orchestrator chooses the best course of action to relocation the replicas. It may choose a multi-step operations. + Some replicas may succeed and some may fail the operation. + The instance (replicas' master) itself may be crashed or inaccessible. 
It is not contacted throughout the operation. + Examples: + + orchestrator -c relocate-replicas -i instance.whose.replicas.will.relocate -d instance.that.becomes.their.master + + orchestrator -c relocate-replicas -i instance.whose.replicas.will.relocate -d instance.that.becomes.their.master --pattern=regexp.filter + only apply to those instances that match given regex + ` + CommandHelp["move-up-replicas"] = ` + Moves replicas of the given instance one level up the topology, making them siblings of given instance. + This is a (faster) shortcut to executing move-up on all replicas of given instance. + Examples: + + orchestrator -c move-up-replicas -i replica.whose.subreplicas.will.move.up.com[:3306] + + orchestrator -c move-up-replicas -i replica.whose.subreplicas.will.move.up.com[:3306] --pattern=regexp.filter + only apply to those instances that match given regex + ` + CommandHelp["move-below"] = ` + Moves a replica beneath its sibling. Both replicas must be actively replicating from same master. + The sibling will become instance's master. No action taken when sibling cannot act as master + (e.g. has no binary logs, is of incompatible version, incompatible binlog format etc.) + Example: + + orchestrator -c move-below -i replica.to.move.com -d sibling.replica.under.which.to.move.com + + orchestrator -c move-below -d sibling.replica.under.which.to.move.com + -i not given, implicitly assumed local hostname + ` + CommandHelp["move-equivalent"] = ` + Moves a replica beneath another server, based on previously recorded "equivalence coordinates". Such coordinates + are obtained whenever orchestrator issues a CHANGE MASTER TO. The "before" and "after" masters coordinates are + persisted. In such cases where the newly relocated replica is unable to replicate (e.g. firewall issues) it is then + easy to revert the relocation via "move-equivalent". + The command works if and only if orchestrator has an exact mapping between the replica's current replication coordinates + and some other coordinates. + Example: + + orchestrator -c move-equivalent -i replica.to.revert.master.position.com -d master.to.move.to.com + ` + CommandHelp["take-siblings"] = ` + Turn all siblings of a replica into its sub-replicas. No action taken for siblings that cannot become + replicas of given instance (e.g. incompatible versions, binlog format etc.). This is a (faster) shortcut + to executing move-below for all siblings of the given instance. Example: + + orchestrator -c take-siblings -i replica.whose.siblings.will.move.below.com + ` + CommandHelp["take-master"] = ` + Turn an instance into a master of its own master; essentially switch the two. Replicas of each of the two + involved instances are unaffected, and continue to replicate as they were. + The instance's master must itself be a replica. It does not necessarily have to be actively replicating. + + orchestrator -c take-master -i replica.that.will.switch.places.with.its.master.com + ` + CommandHelp["repoint"] = ` + Make the given instance replicate from another instance without changing the binglog coordinates. There + are little sanity checks to this and this is a risky operation. Use cases are: a rename of the master's + host, a corruption in relay-logs, move from beneath MaxScale & Binlog-server. 
Examples: + + orchestrator -c repoint -i replica.to.operate.on.com -d new.master.com + + orchestrator -c repoint -i replica.to.operate.on.com + The above will repoint the replica back to its existing master without change + + orchestrator -c repoint + -i not given, implicitly assumed local hostname + ` + CommandHelp["repoint-replicas"] = ` + Repoint all replicas of given instance to replicate back from the instance. This is a convenience method + which implies a one-by-one "repoint" command on each replica. + + orchestrator -c repoint-replicas -i instance.whose.replicas.will.be.repointed.com + + orchestrator -c repoint-replicas + -i not given, implicitly assumed local hostname + ` + CommandHelp["make-co-master"] = ` + Create a master-master replication. Given instance is a replica which replicates directly from a master. + The master is then turned into a replica of the instance. The master is expected to not be a replica. + The read_only property of the replica is unaffected by this operation. Examples: + + orchestrator -c make-co-master -i replica.to.turn.into.co.master.com + + orchestrator -c make-co-master + -i not given, implicitly assumed local hostname + ` + CommandHelp["get-candidate-replica"] = ` + Information command suggesting the most up-to-date replica of a given instance, which can be promoted + as local master to its siblings. If replication is up and running, this command merely gives an + estimate, since replicas advance and progress continuously at different paces. If all replicas of given + instance have broken replication (e.g. because given instance is dead), then this command provides + a definitive candidate, which could act as a replacement master. See also regroup-replicas. Example: + + orchestrator -c get-candidate-replica -i instance.with.replicas.one.of.which.may.be.candidate.com + ` + CommandHelp["regroup-replicas-bls"] = ` + Given an instance that has Binlog Servers for replicas, promote one such Binlog Server over its other + Binlog Server siblings. + + Example: + + orchestrator -c regroup-replicas-bls -i instance.with.binlog.server.replicas.com + + --debug is your friend. + ` + CommandHelp["move-gtid"] = ` + Move a replica beneath another (destination) instance. Orchestrator will reject the operation if GTID is + not enabled on the replica, or is not supported by the would-be master. + You may try and move the replica under any other instance; there are no constraints on the family ties the + two may have, though you should be careful not to try and replicate from a descendant (making an + impossible loop). + Examples: + + orchestrator -c move-gtid -i replica.to.move.com -d instance.that.becomes.its.master + + orchestrator -c move-gtid -d destination.instance.that.becomes.its.master + -i not given, implicitly assumed local hostname + ` + CommandHelp["move-replicas-gtid"] = ` + Moves all replicas of a given instance under another (destination) instance using GTID. This is a (faster) + shortcut to moving each replica via "move-gtid". + Orchestrator will only move those replicas configured with GTID (either Oracle or MariaDB variants) and under the + condition the would-be master supports GTID. 
+ Examples: + + orchestrator -c move-replicas-gtid -i instance.whose.replicas.will.relocate -d instance.that.becomes.their.master + + orchestrator -c move-replicas-gtid -i instance.whose.replicas.will.relocate -d instance.that.becomes.their.master --pattern=regexp.filter + only apply to those instances that match given regex + ` + CommandHelp["regroup-replicas-gtid"] = ` + Given an instance (possibly a crashed one; it is never being accessed), pick one of its replicas and make it + local master of its siblings, using GTID. The rules are similar to those in the "regroup-replicas" command. + Example: + + orchestrator -c regroup-replicas-gtid -i instance.with.gtid.and.replicas.one.of.which.will.turn.local.master.if.possible + + --debug is your friend. + ` + CommandHelp["match"] = ` + Matches a replica beneath another (destination) instance. The choice of destination is almost arbitrary; + it must not be a child/descendant of the instance. But otherwise they don't have to be direct siblings, + and in fact (if you know what you're doing), they don't actually have to belong to the same topology. + The operation expects the relocated instance to be "behind" the destination instance. It only finds out + whether this is the case by the end; the operation is cancelled in the event this is not the case. + No action taken when destination instance cannot act as master (e.g. has no binary logs, is of incompatible version, incompatible binlog format etc.) + Examples: + + orchestrator -c match -i replica.to.relocate.com -d instance.that.becomes.its.master + + orchestrator -c match -d destination.instance.that.becomes.its.master + -i not given, implicitly assumed local hostname + + (this command was previously named "match-below") + ` + CommandHelp["match-replicas"] = ` + Matches all replicas of a given instance under another (destination) instance. This is a (faster) shortcut + to matching said replicas one by one under the destination instance. In fact, this bulk operation is highly + optimized and can execute in orders of magnitude faster, depending on the number of replicas involved and their + respective position behind the instance (the more replicas, the more savings). + The instance itself may be crashed or inaccessible. It is not contacted throughout the operation. Examples: + + orchestrator -c match-replicas -i instance.whose.replicas.will.relocate -d instance.that.becomes.their.master + + orchestrator -c match-replicas -i instance.whose.replicas.will.relocate -d instance.that.becomes.their.master --pattern=regexp.filter + only apply to those instances that match given regex + + (this command was previously named "multi-match-replicas") + ` + CommandHelp["match-up"] = ` + Transport the replica one level up the hierarchy, making it a child of its grandparent. This is + similar in essence to move-up, only based on Pseudo-GTID. The master of the given instance + does not need to be alive or connected (and could in fact be crashed). It is never contacted. + Grandparent instance must be alive and accessible. + Examples: + + orchestrator -c match-up -i replica.to.match.up.com:3306 + + orchestrator -c match-up + -i not given, implicitly assumed local hostname + ` + CommandHelp["match-up-replicas"] = ` + Matches replicas of the given instance one level up the topology, making them siblings of given instance. + This is a (faster) shortcut to executing match-up on all replicas of given instance. The instance need + not be alive / accessible / functional. It can be crashed. 
+ Example: + + orchestrator -c match-up-replicas -i replica.whose.subreplicas.will.match.up.com + + orchestrator -c match-up-replicas -i replica.whose.subreplicas.will.match.up.com[:3306] --pattern=regexp.filter + only apply to those instances that match given regex + ` + CommandHelp["rematch"] = ` + Reconnect a replica onto its master, via Pseudo-GTID. The use case for this operation is a non-crash-safe + replication configuration (e.g. MySQL 5.5) with sync_binlog=1 and log_slave_updates. This operation + implies crash-safe-replication and makes it possible for the replica to reconnect. Example: + + orchestrator -c rematch -i replica.to.rematch.under.its.master + ` + CommandHelp["regroup-replicas"] = ` + Given an instance (possibly a crashed one; it is never being accessed), pick one of its replicas and make it + local master of its siblings, using Pseudo-GTID. It is uncertain that there *is* a replica that will be able to + become master to all its siblings. But if there is one, orchestrator will pick it. There are many + constraints, most notably the replication positions of all replicas, whether they use log_slave_updates, and + otherwise version compatibilities etc. + As many replicas as can be regrouped under the promoted replica are operated on. The rest are untouched. + This command is useful in the event of a crash. For example, in the event that a master dies, this operation + can promote a candidate replacement and set up the remaining topology to correctly replicate from that + replacement replica. Example: + + orchestrator -c regroup-replicas -i instance.with.replicas.one.of.which.will.turn.local.master.if.possible + + --debug is your friend. + ` + + CommandHelp["enable-gtid"] = ` + If possible, enable GTID replication. This works on Oracle (>= 5.6, gtid-mode=1) and MariaDB (>= 10.0). + Replication is stopped for a short duration so as to reconfigure as GTID. In case of error replication remains + stopped. Example: + + orchestrator -c enable-gtid -i replica.compatible.with.gtid.com + ` + CommandHelp["disable-gtid"] = ` + Assuming replica replicates via GTID, disable GTID replication and resume standard file:pos replication. Example: + + orchestrator -c disable-gtid -i replica.replicating.via.gtid.com + ` + CommandHelp["reset-master-gtid-remove-own-uuid"] = ` + Assuming GTID is enabled, reset master on instance, remove GTID entries generated by the instance. + This operation is only allowed on Oracle-GTID enabled servers that have no replicas. + It is used for cleaning up the GTID mess incurred by mistakenly issuing queries on the replica (even such + queries as "FLUSH ENGINE LOGS" that happen to write to binary logs). Example: + + orchestrator -c reset-master-gtid-remove-own-uuid -i replica.running.with.gtid.com + ` + CommandHelp["stop-slave"] = ` + Issues a STOP SLAVE; command. Example: + + orchestrator -c stop-slave -i replica.to.be.stopped.com + ` + CommandHelp["start-slave"] = ` + Issues a START SLAVE; command. Example: + + orchestrator -c start-slave -i replica.to.be.started.com + ` + CommandHelp["restart-slave"] = ` + Issues STOP SLAVE + START SLAVE; Example: + + orchestrator -c restart-slave -i replica.to.be.started.com + ` + CommandHelp["skip-query"] = ` + On a failed replicating replica, skips a single query and attempts to resume replication. + Only applies when the replication seems to be broken on the SQL thread (e.g. on duplicate + key error). Also works in GTID mode. 
Example: + + orchestrator -c skip-query -i replica.with.broken.sql.thread.com + ` + CommandHelp["reset-slave"] = ` + Issues a RESET SLAVE command. Destructive to replication. Example: + + orchestrator -c reset-slave -i replica.to.reset.com + ` + CommandHelp["detach-replica"] = ` + Stops replication and modifies binlog position into an impossible, yet reversible, value. + This effectively means the replication becomes broken. See reattach-replica. Example: + + orchestrator -c detach-replica -i replica.whose.replication.will.break.com + + Issuing this on an already detached replica will do nothing. + ` + CommandHelp["reattach-replica"] = ` + Undo a detach-replica operation. Reverses the binlog change into the original values, and + resumes replication. Example: + + orchestrator -c reattach-replica -i detahced.replica.whose.replication.will.amend.com + + Issuing this on an attached (i.e. normal) replica will do nothing. + ` + CommandHelp["detach-replica-master-host"] = ` + Stops replication and modifies Master_Host into an impossible, yet reversible, value. + This effectively means the replication becomes broken. See reattach-replica-master-host. Example: + + orchestrator -c detach-replica-master-host -i replica.whose.replication.will.break.com + + Issuing this on an already detached replica will do nothing. + ` + CommandHelp["reattach-replica-master-host"] = ` + Undo a detach-replica-master-host operation. Reverses the hostname change into the original value, and + resumes replication. Example: + + orchestrator -c reattach-replica-master-host -i detahced.replica.whose.replication.will.amend.com + + Issuing this on an attached (i.e. normal) replica will do nothing. + ` + CommandHelp["restart-slave-statements"] = ` + Prints a list of statements to execute to stop then restore replica to same execution state. + Provide --statement for injected statement. + This is useful for issuing a command that can only be executed while replica is stopped. Such + commands are any of CHANGE MASTER TO. + Orchestrator will not execute given commands, only print them as courtesy. It may not have + the privileges to execute them in the first place. Example: + + orchestrator -c restart-slave-statements -i some.replica.com -statement="change master to master_heartbeat_period=5" + ` + + CommandHelp["set-read-only"] = ` + Turn an instance read-only, via SET GLOBAL read_only := 1. Examples: + + orchestrator -c set-read-only -i instance.to.turn.read.only.com + + orchestrator -c set-read-only + -i not given, implicitly assumed local hostname + ` + CommandHelp["set-writeable"] = ` + Turn an instance writeable, via SET GLOBAL read_only := 0. Example: + + orchestrator -c set-writeable -i instance.to.turn.writeable.com + + orchestrator -c set-writeable + -i not given, implicitly assumed local hostname + ` + + CommandHelp["flush-binary-logs"] = ` + Flush binary logs on an instance. Examples: + + orchestrator -c flush-binary-logs -i instance.with.binary.logs.com + + orchestrator -c flush-binary-logs -i instance.with.binary.logs.com --binlog=mysql-bin.002048 + Flushes binary logs until reaching given number. Fails when current number is larger than input + ` + CommandHelp["purge-binary-logs"] = ` + Purge binary logs on an instance. 
Examples: + + orchestrator -c purge-binary-logs -i instance.with.binary.logs.com --binlog mysql-bin.002048 + + Purges binary logs until given log + ` + CommandHelp["last-pseudo-gtid"] = ` + Information command; an authoritative way of detecting whether a Pseudo-GTID event exist for an instance, + and if so, output the last Pseudo-GTID entry and its location. Example: + + orchestrator -c last-pseudo-gtid -i instance.with.possible.pseudo-gtid.injection + ` + CommandHelp["find-binlog-entry"] = ` + Get binlog file:pos of entry given by --pattern (exact full match, not a regular expression) in a given instance. + This will search the instance's binary logs starting with most recent, and terminate as soon as an exact match is found. + The given input is not a regular expression. It must fully match the entry (not a substring). + This is most useful when looking for uniquely identifyable values, such as Pseudo-GTID. Example: + + orchestrator -c find-binlog-entry -i instance.to.search.on.com --pattern "insert into my_data (my_column) values ('distinct_value_01234_56789')" + + Prints out the binlog file:pos where the entry is found, or errors if unfound. + ` + CommandHelp["correlate-binlog-pos"] = ` + Given an instance (-i) and binlog coordinates (--binlog=file:pos), find the correlated coordinates in another instance (-d). + "Correlated coordinates" are those that present the same point-in-time of sequence of binary log events, untangling + the mess of different binlog file:pos coordinates on different servers. + This operation relies on Pseudo-GTID: your servers must have been pre-injected with PSeudo-GTID entries as these are + being used as binlog markers in the correlation process. + You must provide a valid file:pos in the binlogs of the source instance (-i), and in response get the correlated + coordinates in the binlogs of the destination instance (-d). This operation does not work on relay logs. + Example: + + orchestrator -c correlate-binlog-pos -i instance.with.binary.log.com --binlog=mysql-bin.002366:14127 -d other.instance.with.binary.logs.com + + Prints out correlated coordinates, e.g.: "mysql-bin.002302:14220", or errors out. + ` + + CommandHelp["submit-pool-instances"] = ` + Submit a pool name with a list of instances in that pool. This removes any previous instances associated with + that pool. Expecting comma delimited list of instances + + orchestrator -c submit-pool-instances --pool name_of_pool -i pooled.instance1.com,pooled.instance2.com:3306,pooled.instance3.com + ` + CommandHelp["cluster-pool-instances"] = ` + List all pools and their associated instances. Output is in tab delimited format, and lists: + cluster_name, cluster_alias, pool_name, pooled instance + Example: + + orchestrator -c cluster-pool-instances + ` + CommandHelp["which-heuristic-cluster-pool-instances"] = ` + List instances belonging to a cluster, which are also in some pool or in a specific given pool. + Not all instances are listed: unreachable, downtimed instances ar left out. Only those that should be + responsive and healthy are listed. This serves applications in getting information about instances + that could be queried (this complements a proxy behavior in providing the *list* of instances). 
+ Examples: + + orchestrator -c which-heuristic-cluster-pool-instances --alias mycluster + Get the instances of a specific cluster, no specific pool + + orchestrator -c which-heuristic-cluster-pool-instances --alias mycluster --pool some_pool + Get the instances of a specific cluster and which belong to a given pool + + orchestrator -c which-heuristic-cluster-pool-instances -i instance.belonging.to.a.cluster + Cluster inferred by given instance + + orchestrator -c which-heuristic-cluster-pool-instances + Cluster inferred by local hostname + ` + + CommandHelp["find"] = ` + Find instances whose hostname matches given regex pattern. Example: + + orchestrator -c find -pattern "backup.*us-east" + ` + CommandHelp["clusters"] = ` + List all clusters known to orchestrator. A cluster (aka topology, aka chain) is identified by its + master (or one of its master if more than one exists). Example: + + orchestrator -c clusters + -i not given, implicitly assumed local hostname + ` + CommandHelp["all-clusters-masters"] = ` + List of writeable masters, one per cluster. + For most single-master topologies, this is trivially the master. + For active-active master-master topologies, this ensures only one of + the masters is returned. Example: + + orchestrator -c all-clusters-masters + ` + CommandHelp["topology"] = ` + Show an ascii-graph of a replication topology, given a member of that topology. Example: + + orchestrator -c topology -i instance.belonging.to.a.topology.com + + orchestrator -c topology + -i not given, implicitly assumed local hostname + + Instance must be already known to orchestrator. Topology is generated by orchestrator's mapping + and not from synchronuous investigation of the instances. The generated topology may include + instances that are dead, or whose replication is broken. + ` + CommandHelp["all-instances"] = ` + List the complete known set of instances. Similar to '-c find -pattern "."' Example: + + orchestrator -c all-instances + ` + CommandHelp["which-instance"] = ` + Output the fully-qualified hostname:port representation of the given instance, or error if unknown + to orchestrator. Examples: + + orchestrator -c which-instance -i instance.to.check.com + + orchestrator -c which-instance + -i not given, implicitly assumed local hostname + ` + CommandHelp["which-cluster"] = ` + Output the name of the cluster an instance belongs to, or error if unknown to orchestrator. Examples: + + orchestrator -c which-cluster -i instance.to.check.com + + orchestrator -c which-cluster + -i not given, implicitly assumed local hostname + ` + CommandHelp["which-cluster-instances"] = ` + Output the list of instances participating in same cluster as given instance; output is one line + per instance, in hostname:port format. Examples: + + orchestrator -c which-cluster-instances -i instance.to.check.com + + orchestrator -c which-cluster-instances + -i not given, implicitly assumed local hostname + + orchestrator -c which-cluster-instances -alias some_alias + assuming some_alias is a known cluster alias (see ClusterNameToAlias or DetectClusterAliasQuery configuration) + ` + CommandHelp["which-cluster-domain"] = ` + Output the domain name of given cluster, indicated by instance or alias. This depends on + the DetectClusterDomainQuery configuration. 
Example: + + orchestrator -c which-cluster-domain -i instance.to.check.com + + orchestrator -c which-cluster-domain + -i not given, implicitly assumed local hostname + + orchestrator -c which-cluster-domain -alias some_alias + assuming some_alias is a known cluster alias (see ClusterNameToAlias or DetectClusterAliasQuery configuration) + ` + CommandHelp["which-heuristic-domain-instance"] = ` + Returns the instance associated as the cluster's writer with a cluster's domain name. + Given a cluster, orchestrator looks for the domain name indicated by this cluster, and proceeds to search for + a stord key-value attribute for that domain name. This would be the writer host for the given domain. + See also set-heuristic-domain-instance, this is meant to be a temporary service mimicking in micro-scale a + service discovery functionality. + Example: + + orchestrator -c which-heuristic-domain-instance -alias some_alias + Detects the domain name for given cluster, reads from key-value store the writer host associated with the domain name. + + orchestrator -c which-heuristic-domain-instance -i instance.of.some.cluster + Cluster is inferred by a member instance (the instance is not necessarily the master) + ` + CommandHelp["which-cluster-master"] = ` + Output the name of the active master in a given cluster, indicated by instance or alias. + An "active" master is one that is writable and is not marked as downtimed due to a topology recovery. + Examples: + + orchestrator -c which-cluster-master -i instance.to.check.com + + orchestrator -c which-cluster-master + -i not given, implicitly assumed local hostname + + orchestrator -c which-cluster-master -alias some_alias + assuming some_alias is a known cluster alias (see ClusterNameToAlias or DetectClusterAliasQuery configuration) + ` + CommandHelp["which-cluster-osc-replicas"] = ` + Output a list of replicas in same cluster as given instance, that would server as good candidates as control replicas + for a pt-online-schema-change operation. + Those replicas would be used for replication delay so as to throtthe osc operation. Selected replicas will include, + where possible: intermediate masters, their replicas, 3rd level replicas, direct non-intermediate-master replicas. + + orchestrator -c which-cluster-osc-replicas -i instance.to.check.com + + orchestrator -c which-cluster-osc-replicas + -i not given, implicitly assumed local hostname + + orchestrator -c which-cluster-osc-replicas -alias some_alias + assuming some_alias is a known cluster alias (see ClusterNameToAlias or DetectClusterAliasQuery configuration) + ` + CommandHelp["which-lost-in-recovery"] = ` + List instances marked as downtimed for being lost in a recovery process. The output of this command lists + "lost" instances that probably should be recycled. + The topology recovery process injects a magic hint when downtiming lost instances, that is picked up + by this command. Examples: + + orchestrator -c which-lost-in-recovery + Lists all heuristically-recent known lost instances + ` + CommandHelp["which-master"] = ` + Output the fully-qualified hostname:port representation of a given instance's master. Examples: + + orchestrator -c which-master -i a.known.replica.com + + orchestrator -c which-master + -i not given, implicitly assumed local hostname + ` + CommandHelp["which-replicas"] = ` + Output the fully-qualified hostname:port list of replicas (one per line) of a given instance (or empty + list if instance is not a master to anyone). 
Examples: + + orchestrator -c which-replicas -i a.known.instance.com + + orchestrator -c which-replicas + -i not given, implicitly assumed local hostname + ` + CommandHelp["get-cluster-heuristic-lag"] = ` + For a given cluster (indicated by an instance or alias), output a heuristic "representative" lag of that cluster. + The output is obtained by examining the replicas that are members of the "which-cluster-osc-replicas" command, and + getting the maximum replica lag of those replicas. Recall that those replicas are a subset of the entire cluster, + and that they are being polled periodically. Hence the output of this command is not necessarily up-to-date + and does not represent all replicas in the cluster. Examples: + + orchestrator -c get-cluster-heuristic-lag -i instance.that.is.part.of.cluster.com + + orchestrator -c get-cluster-heuristic-lag + -i not given, implicitly assumed local host, cluster implied + + orchestrator -c get-cluster-heuristic-lag -alias some_alias + assuming some_alias is a known cluster alias (see ClusterNameToAlias or DetectClusterAliasQuery configuration) + ` + CommandHelp["instance-status"] = ` + Output short status on a given instance (name, replication status, notable configuration). Examples: + + orchestrator -c instance-status -i instance.to.investigate.com + + orchestrator -c instance-status + -i not given, implicitly assumed local hostname + ` + CommandHelp["snapshot-topologies"] = ` + Take a snapshot of existing topologies. This will record minimal replication topology data: the identity + of an instance, its master and its cluster. + Taking a snapshot later allows for reviewing changes in topologies. One might wish to invoke this command + on a daily basis, and later be able to solve questions like 'where was this instance replicating from before + we moved it?', 'which instances were replicating from this instance a week ago?' etc. Example: + + orchestrator -c snapshot-topologies + ` + + CommandHelp["discover"] = ` + Request that orchestrator contacts the given instance, reads its status, and upserts it into + orchestrator's repository. Examples: + + orchestrator -c discover -i instance.to.discover.com:3306 + + orchestrator -c discover -i cname.of.instance + + orchestrator -c discover + -i not given, implicitly assumed local hostname + + Orchestrator will resolve CNAMEs and VIPs. + ` + CommandHelp["forget"] = ` + Request that orchestrator remove the given instance from its repository. If the instance is alive + and connected through replication to otherwise known and live instances, orchestrator will + re-discover it by nature of its discovery process. Instances are auto-removed via config's + UnseenAgentForgetHours. If you happen to know a machine is decommissioned, for example, it + can be nice to remove it from the repository before it auto-expires. Example: + + orchestrator -c forget -i instance.to.forget.com + + Orchestrator will *not* resolve CNAMEs and VIPs for given instance. + ` + CommandHelp["begin-maintenance"] = ` + Request a maintenance lock on an instance. Topology changes require placing locks on the minimal set of + affected instances, so as to avoid an incident of two uncoordinated operations on a same instance (leading + to possible chaos). Locks are placed in the backend database, and so multiple orchestrator instances are safe. + Operations automatically acquire locks and release them. This command manually acquires a lock, and will + block other operations on the instance until lock is released. 
+ Note that orchestrator automatically assumes locks to be expired after MaintenanceExpireMinutes (hard coded value). + Examples: + + orchestrator -c begin-maintenance -i instance.to.lock.com --duration=3h --reason="load testing; do not disturb" + accepted duration format: 10s, 30m, 24h, 3d, 4w + + orchestrator -c begin-maintenance -i instance.to.lock.com --reason="load testing; do not disturb" + --duration not given; default to MaintenanceExpireMinutes (hard coded value) + ` + CommandHelp["end-maintenance"] = ` + Remove maintenance lock; such a lock may have been gained by an explicit begin-maintenance command or implicitly + by a topology change. You should generally only remove locks you have placed manually; orchestrator will + automatically expire locks after MaintenanceExpireMinutes (hard coded value). + Example: + + orchestrator -c end-maintenance -i locked.instance.com + ` + CommandHelp["begin-downtime"] = ` + Mark an instance as downtimed. A downtimed instance is assumed to be taken care of, and recovery-analysis does + not apply for such an instance. As a result, no recommendation for recovery, and no automated-recovery are issued + on a downtimed instance. + Downtime is different from maintenance in that it places no lock (maintenance uses an exclusive lock on the instance). + It is OK to downtime an instance that is already downtimed -- the new begin-downtime command will override whatever + previous downtime attributes there were on the downtimed instance. + Note that orchestrator automatically assumes downtime to be expired after MaintenanceExpireMinutes (hard coded value). + Examples: + + orchestrator -c begin-downtime -i instance.to.downtime.com --duration=3h --reason="dba handling; do not do recovery" + accepted duration format: 10s, 30m, 24h, 3d, 4w + + orchestrator -c begin-downtime -i instance.to.lock.com --reason="dba handling; do not do recovery" + --duration not given; default to MaintenanceExpireMinutes (hard coded value) + ` + CommandHelp["end-downtime"] = ` + Indicate an instance is no longer downtimed. Typically you should not need to use this since + a downtime is always bounded by a duration and auto-expires. But you may use this to forcibly + indicate the active downtime should be expired now. + Example: + + orchestrator -c end-downtime -i downtimed.instance.com + ` + + CommandHelp["recover"] = ` + Do auto-recovery given a dead instance. Orchestrator chooses the best course of action. + The given instance must be acknowledged as dead and have replicas, or else there's nothing to do. + See "replication-analysis" command. + Orchestrator executes external processes as configured by *Processes variables. + --debug is your friend. Example: + + orchestrator -c recover -i dead.instance.com --debug + ` + CommandHelp["recover-lite"] = ` + Do auto-recovery given a dead instance. Orchestrator chooses the best course of action, exactly + as in "-c recover". Orchestrator will *not* execute external processes. + + orchestrator -c recover-lite -i dead.instance.com --debug + ` + CommandHelp["force-master-failover"] = ` + Forcibly begin a master failover process, even if orchestrator does not see anything wrong + in particular with the master. + - This will not work in a master-master configuration + - Orchestrator just treats this command as a DeadMaster failover scenario + - Orchestrator will issue all relevant pre-failover and post-failover external processes. 
+ - Orchestrator will not attempt to recover/reconnect the old master + ` + CommandHelp["force-master-takeover"] = ` + Forcibly discard master and promote another (direct child) instance instead, even if everything is running well. + This allows for planned switchover. + NOTE: + - You must specify the instance to promote via "-d" + - Promoted instance must be a direct child of the existing master + - This will not work in a master-master configuration + - Orchestrator just treats this command as a DeadMaster failover scenario + - It is STRONGLY suggested that you first relocate everything below your chosen instance-to-promote. + It *is* a planned failover thing. + - Otherwise orchestrator will do its thing in moving instances around, hopefully promoting your requested + server on top. + - Orchestrator will issue all relevant pre-failover and post-failover external processes. + - In this command orchestrator will not issue 'SET GLOBAL read_only=1' on the existing master, nor will + it issue a 'FLUSH TABLES WITH READ LOCK'. Please see the 'graceful-master-takeover' command. + Examples: + + orchestrator -c force-master-takeover -alias mycluster -d immediate.child.of.master.com + Indicate cluster by alias. Orchestrator automatically figures out the master + + orchestrator -c force-master-takeover -i instance.in.relevant.cluster.com -d immediate.child.of.master.com + Indicate cluster by an instance. You don't strictly need to specify the master; orchestrator + will infer the master's identity. + ` + CommandHelp["graceful-master-takeover"] = ` + Gracefully discard master and promote another (direct child) instance instead, even if everything is running well. + This allows for planned switchover. + NOTE: + - Promoted instance must be a direct child of the existing master + - Promoted instance must be the *only* direct child of the existing master. It *is* a planned failover thing. + - Orchestrator will first issue a "set global read_only=1" on existing master + - It will promote candidate master to the binlog positions of the existing master after issuing the above + - There _could_ still be statements issued and executed on the existing master by SUPER users, but those are ignored. + - Orchestrator then proceeds to handle a DeadMaster failover scenario + - Orchestrator will issue all relevant pre-failover and post-failover external processes. + Examples: + + orchestrator -c graceful-master-takeover -alias mycluster + Indicate cluster by alias. Orchestrator automatically figures out the master and verifies it has a single direct replica + + orchestrator -c graceful-master-takeover -i instance.in.relevant.cluster.com + Indicate cluster by an instance. You don't strictly need to specify the master; orchestrator + will infer the master's identity. + ` + CommandHelp["replication-analysis"] = ` + Request an analysis of potential crash incidents in all known topologies. + Output format is not yet stabilized and may change in the future. Do not trust the output + for automated parsing. Use the web API instead, at this time. Example: + + orchestrator -c replication-analysis + ` + CommandHelp["ack-cluster-recoveries"] = ` + Acknowledge recoveries for a given cluster; this unblocks pending future recoveries. + Acknowledging a recovery requires a comment (supply via --reason). Acknowledgement clears the in-active-period + flag for affected recoveries, which in turn affects any blocking recoveries. + Multiple recoveries may be affected. Only unacknowledged recoveries will be affected. 
+ Examples: + + orchestrator -c ack-cluster-recoveries -i instance.in.a.cluster.com --reason="dba has taken necessary steps" + Cluster is indicated by any of its members. The recovery need not necessarily be on/to given instance. + + orchestrator -c ack-cluster-recoveries -alias some_alias --reason="dba has taken necessary steps" + Cluster indicated by alias + ` + CommandHelp["ack-instance-recoveries"] = ` + Acknowledge recoveries for a given instance; this unblocks pending future recoveries. + Acknowledging a recovery requires a comment (supply via --reason). Acknowledgement clears the in-active-period + flag for affected recoveries, which in turn affects any blocking recoveries. + Multiple recoveries may be affected. Only unacknowledged recoveries will be affected. + Example: + + orchestrator -c ack-instance-recoveries -i instance.that.failed.com --reason="dba has taken necessary steps" + ` + + CommandHelp["register-candidate"] = ` + Indicate that a specific instance is a preferred candidate for master promotion. Upon a dead master + recovery, orchestrator will do its best to promote instances that are marked as candidates. However + orchestrator cannot guarantee this will always work. Issues like version compatibilities, binlog format + etc. are limiting factors. + You will want to mark an instance as a candidate when: it is replicating directly from the master, has + binary logs and log_slave_updates is enabled, uses the same binlog_format as its siblings, and has a version + compatible with its siblings. If you're using DataCenterPattern & PhysicalEnvironmentPattern (see configuration), + you would further wish to make sure you have a candidate in each data center. + Orchestrator first promotes the best-possible replica, and only then replaces it with your candidate, + and only if both are in the same datacenter and physical environment. + An instance needs to continuously be marked as a candidate, so as to make sure orchestrator is not wasting + time with stale instances. Orchestrator periodically clears candidate-registration for instances that have + not been registered for over CandidateInstanceExpireMinutes (see config). + Example: + + orchestrator -c register-candidate -i candidate.instance.com + + orchestrator -c register-candidate + -i not given, implicitly assumed local hostname + ` + CommandHelp["register-hostname-unresolve"] = ` + Assigns the given instance a virtual (aka "unresolved") name. When moving replicas under an instance with an assigned + "unresolved" name, orchestrator issues a CHANGE MASTER TO MASTER_HOST='' ... + This is useful in cases where your master is behind a virtual IP (e.g. active/passive masters with shared storage or DRBD, + e.g. binlog servers sharing a common VIP). + A "repoint" command is useful after "register-hostname-unresolve": you can repoint replicas of the instance to their exact + same location, and orchestrator will swap the fqdn of their master with the unresolved name. + Such registration must be periodic. Orchestrator automatically expires such registration after ExpiryHostnameResolvesMinutes. + Example: + + orchestrator -c register-hostname-unresolve -i instance.fqdn.com --hostname=virtual.name.com + ` + CommandHelp["deregister-hostname-unresolve"] = ` + Explicitly deregister/disassociate a hostname with an "unresolved" name. Orchestrator merely removes the association, but does + not touch any replica at this point. 
A "repoint" command can be useful right after calling this command to change replica's master host + name (assumed to be an "unresolved" name, such as a VIP) with the real fqdn of the master host. + Example: + + orchestrator -c deregister-hostname-unresolve -i instance.fqdn.com + ` + CommandHelp["set-heuristic-domain-instance"] = ` + This is a temporary (sync your watches, watch for next ice age) command which registers the cluster domain name of a given cluster + with the master/writer host for that cluster. It is a one-time-master-discovery operation. + At this time orchestrator may also act as a small & simple key-value store (recall the "temporary" indication). + Master failover operations will overwrite the domain instance identity. Orchestrator so turns into a mini master-discovery + service (I said "TEMPORARY"). Really there are other tools for the job. See also: which-heuristic-domain-instance + Example: + + orchestrator -c set-heuristic-domain-instance --alias some_alias + Detects the domain name for given cluster, identifies the writer master of the cluster, associates the two in key-value store + + orchestrator -c set-heuristic-domain-instance -i instance.of.some.cluster + Cluster is inferred by a member instance (the instance is not necessarily the master) + ` + + CommandHelp["continuous"] = ` + Enter continuous mode, and actively poll for instances, diagnose problems, do maintenance etc. + This type of work is typically done in HTTP mode. However nothing prevents orchestrator from + doing it in command line. Invoking with "continuous" will run indefinitely. Example: + + orchestrator -c continuous + ` + CommandHelp["active-nodes"] = ` + List orchestrator nodes or processes that are actively running or have most recently + executed. Output is in hostname:token format, where "token" is an internal unique identifier + of an orchestrator process. Example: + + orchestrator -c active-nodes + ` + CommandHelp["access-token"] = ` + When running HTTP with "AuthenticationMethod" : "token", receive a new access token. + This token must be utilized within "AccessTokenUseExpirySeconds" and can then be used + until "AccessTokenExpiryMinutes" have passed. + In "token" authentication method a user is read-only unless able to provide with a fresh token. + A token may only be used once (two users must get two distinct tokens). + Submitting a token is done via "/web/access-token?publicToken=". The token is then stored + in HTTP cookie. + + orchestrator -c access-token + ` + CommandHelp["reset-hostname-resolve-cache"] = ` + Clear the hostname resolve cache; it will be refilled by following host discoveries + + orchestrator -c reset-hostname-resolve-cache + ` + CommandHelp["resolve"] = ` + Utility command to resolve a CNAME and return resolved hostname name. Example: + + orchestrator -c resolve -i cname.to.resolve + ` + CommandHelp["redeploy-internal-db"] = ` + Force internal schema migration to current backend structure. Orchestrator keeps track of the deployed + versions and will not reissue a migration for a version already deployed. Normally you should not use + this command, and it is provided mostly for building and testing purposes. Nonetheless it is safe to + use and at most it wastes some cycles. 
+ ` + + for key := range CommandHelp { + CommandHelp[key] = strings.Trim(CommandHelp[key], "\n") + } +} + +func HelpCommand(command string) { + fmt.Println( + fmt.Sprintf("%s:\n%s", command, CommandHelp[command])) +} diff --git a/go/vt/orchestrator/app/http.go b/go/vt/orchestrator/app/http.go new file mode 100644 index 0000000000..21693fe2de --- /dev/null +++ b/go/vt/orchestrator/app/http.go @@ -0,0 +1,209 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package app + +import ( + "net" + nethttp "net/http" + "strings" + "time" + + "vitess.io/vitess/go/vt/orchestrator/agent" + "vitess.io/vitess/go/vt/orchestrator/collection" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/http" + "vitess.io/vitess/go/vt/orchestrator/inst" + "vitess.io/vitess/go/vt/orchestrator/logic" + "vitess.io/vitess/go/vt/orchestrator/process" + "vitess.io/vitess/go/vt/orchestrator/ssl" + + "github.com/go-martini/martini" + "github.com/martini-contrib/auth" + "github.com/martini-contrib/gzip" + "github.com/martini-contrib/render" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +const discoveryMetricsName = "DISCOVERY_METRICS" + +var sslPEMPassword []byte +var agentSSLPEMPassword []byte +var discoveryMetrics *collection.Collection + +// Http starts serving +func Http(continuousDiscovery bool) { + promptForSSLPasswords() + process.ContinuousRegistration(process.OrchestratorExecutionHttpMode, "") + + martini.Env = martini.Prod + if config.Config.ServeAgentsHttp { + go agentsHttp() + } + standardHttp(continuousDiscovery) +} + +// Iterate over the private keys and get passwords for them +// Don't prompt for a password a second time if the files are the same +func promptForSSLPasswords() { + if ssl.IsEncryptedPEM(config.Config.SSLPrivateKeyFile) { + sslPEMPassword = ssl.GetPEMPassword(config.Config.SSLPrivateKeyFile) + } + if ssl.IsEncryptedPEM(config.Config.AgentSSLPrivateKeyFile) { + if config.Config.AgentSSLPrivateKeyFile == config.Config.SSLPrivateKeyFile { + agentSSLPEMPassword = sslPEMPassword + } else { + agentSSLPEMPassword = ssl.GetPEMPassword(config.Config.AgentSSLPrivateKeyFile) + } + } +} + +// standardHttp starts serving HTTP or HTTPS (api/web) requests, to be used by normal clients +func standardHttp(continuousDiscovery bool) { + m := martini.Classic() + + switch strings.ToLower(config.Config.AuthenticationMethod) { + case "basic": + { + if config.Config.HTTPAuthUser == "" { + // Still allowed; may be disallowed in future versions + log.Warning("AuthenticationMethod is configured as 'basic' but HTTPAuthUser undefined. 
Running without authentication.") + } + m.Use(auth.Basic(config.Config.HTTPAuthUser, config.Config.HTTPAuthPassword)) + } + case "multi": + { + if config.Config.HTTPAuthUser == "" { + // Still allowed; may be disallowed in future versions + log.Fatal("AuthenticationMethod is configured as 'multi' but HTTPAuthUser undefined") + } + + m.Use(auth.BasicFunc(func(username, password string) bool { + if username == "readonly" { + // Will be treated as "read-only" + return true + } + return auth.SecureCompare(username, config.Config.HTTPAuthUser) && auth.SecureCompare(password, config.Config.HTTPAuthPassword) + })) + } + default: + { + // We inject a dummy User object because we have function signatures with User argument in api.go + m.Map(auth.User("")) + } + } + + m.Use(gzip.All()) + // Render html templates from templates directory + m.Use(render.Renderer(render.Options{ + Directory: "resources", + Layout: "templates/layout", + HTMLContentType: "text/html", + })) + m.Use(martini.Static("resources/public", martini.StaticOptions{Prefix: config.Config.URLPrefix})) + if config.Config.UseMutualTLS { + m.Use(ssl.VerifyOUs(config.Config.SSLValidOUs)) + } + + inst.SetMaintenanceOwner(process.ThisHostname) + + if continuousDiscovery { + // start to expire metric collection info + discoveryMetrics = collection.CreateOrReturnCollection(discoveryMetricsName) + discoveryMetrics.SetExpirePeriod(time.Duration(config.Config.DiscoveryCollectionRetentionSeconds) * time.Second) + + log.Info("Starting Discovery") + go logic.ContinuousDiscovery() + } + + log.Info("Registering endpoints") + http.API.URLPrefix = config.Config.URLPrefix + http.Web.URLPrefix = config.Config.URLPrefix + http.API.RegisterRequests(m) + http.Web.RegisterRequests(m) + + // Serve + if config.Config.ListenSocket != "" { + log.Infof("Starting HTTP listener on unix socket %v", config.Config.ListenSocket) + unixListener, err := net.Listen("unix", config.Config.ListenSocket) + if err != nil { + log.Fatale(err) + } + defer unixListener.Close() + if err := nethttp.Serve(unixListener, m); err != nil { + log.Fatale(err) + } + } else if config.Config.UseSSL { + log.Info("Starting HTTPS listener") + tlsConfig, err := ssl.NewTLSConfig(config.Config.SSLCAFile, config.Config.UseMutualTLS) + if err != nil { + log.Fatale(err) + } + tlsConfig.InsecureSkipVerify = config.Config.SSLSkipVerify + if err = ssl.AppendKeyPairWithPassword(tlsConfig, config.Config.SSLCertFile, config.Config.SSLPrivateKeyFile, sslPEMPassword); err != nil { + log.Fatale(err) + } + if err = ssl.ListenAndServeTLS(config.Config.ListenAddress, m, tlsConfig); err != nil { + log.Fatale(err) + } + } else { + log.Infof("Starting HTTP listener on %+v", config.Config.ListenAddress) + if err := nethttp.ListenAndServe(config.Config.ListenAddress, m); err != nil { + log.Fatale(err) + } + } + log.Info("Web server started") +} + +// agentsHttp startes serving agents HTTP or HTTPS API requests +func agentsHttp() { + m := martini.Classic() + m.Use(gzip.All()) + m.Use(render.Renderer()) + if config.Config.AgentsUseMutualTLS { + m.Use(ssl.VerifyOUs(config.Config.AgentSSLValidOUs)) + } + + log.Info("Starting agents listener") + + agent.InitHttpClient() + go logic.ContinuousAgentsPoll() + + http.AgentsAPI.URLPrefix = config.Config.URLPrefix + http.AgentsAPI.RegisterRequests(m) + + // Serve + if config.Config.AgentsUseSSL { + log.Info("Starting agent HTTPS listener") + tlsConfig, err := ssl.NewTLSConfig(config.Config.AgentSSLCAFile, config.Config.AgentsUseMutualTLS) + if err != nil { + log.Fatale(err) + } + 
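+		// As with the standard HTTPS listener above, optionally relax certificate verification
+		// and load the agent key pair, decrypted with the passphrase gathered by promptForSSLPasswords().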
tlsConfig.InsecureSkipVerify = config.Config.AgentSSLSkipVerify + if err = ssl.AppendKeyPairWithPassword(tlsConfig, config.Config.AgentSSLCertFile, config.Config.AgentSSLPrivateKeyFile, agentSSLPEMPassword); err != nil { + log.Fatale(err) + } + if err = ssl.ListenAndServeTLS(config.Config.AgentsServerPort, m, tlsConfig); err != nil { + log.Fatale(err) + } + } else { + log.Info("Starting agent HTTP listener") + if err := nethttp.ListenAndServe(config.Config.AgentsServerPort, m); err != nil { + log.Fatale(err) + } + } + log.Info("Agent server started") +} diff --git a/go/vt/orchestrator/attributes/attributes.go b/go/vt/orchestrator/attributes/attributes.go new file mode 100644 index 0000000000..db8015be42 --- /dev/null +++ b/go/vt/orchestrator/attributes/attributes.go @@ -0,0 +1,26 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package attributes + +// HostAttributes presnts attributes submitted by a host +type HostAttributes struct { + Hostname string + AttributeName string + AttributeValue string + SubmitTimestamp string + ExpireTimestamp string +} diff --git a/go/vt/orchestrator/attributes/attributes_dao.go b/go/vt/orchestrator/attributes/attributes_dao.go new file mode 100644 index 0000000000..2b49b8cc81 --- /dev/null +++ b/go/vt/orchestrator/attributes/attributes_dao.go @@ -0,0 +1,164 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+
+package attributes
+
+import (
+	"errors"
+	"fmt"
+	"io/ioutil"
+	"net/http"
+	"strings"
+
+	"vitess.io/vitess/go/vt/orchestrator/db"
+	"vitess.io/vitess/go/vt/orchestrator/external/golib/log"
+	"vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils"
+)
+
+func readResponse(res *http.Response, err error) ([]byte, error) {
+	if err != nil {
+		return nil, err
+	}
+
+	defer res.Body.Close()
+	body, err := ioutil.ReadAll(res.Body)
+	if err != nil {
+		return nil, err
+	}
+
+	if res.StatusCode == http.StatusInternalServerError {
+		return body, errors.New("Response Status 500")
+	}
+
+	return body, nil
+}
+
+// SetHostAttributes stores a single attribute name/value pair for the given hostname
+func SetHostAttributes(hostname string, attributeName string, attributeValue string) error {
+	_, err := db.ExecOrchestrator(`
+		replace
+			into host_attributes (
+				hostname, attribute_name, attribute_value, submit_timestamp, expire_timestamp
+			) VALUES (
+				?, ?, ?, NOW(), NULL
+			)
+		`,
+		hostname,
+		attributeName,
+		attributeValue,
+	)
+	if err != nil {
+		return log.Errore(err)
+	}
+
+	return err
+}
+
+func getHostAttributesByClause(whereClause string, args []interface{}) ([]HostAttributes, error) {
+	res := []HostAttributes{}
+	query := fmt.Sprintf(`
+		select
+			hostname,
+			attribute_name,
+			attribute_value,
+			submit_timestamp,
+			ifnull(expire_timestamp, '') as expire_timestamp
+		from
+			host_attributes
+		%s
+		order by
+			hostname, attribute_name
+		`, whereClause)
+
+	err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error {
+		hostAttributes := HostAttributes{}
+		hostAttributes.Hostname = m.GetString("hostname")
+		hostAttributes.AttributeName = m.GetString("attribute_name")
+		hostAttributes.AttributeValue = m.GetString("attribute_value")
+		hostAttributes.SubmitTimestamp = m.GetString("submit_timestamp")
+		hostAttributes.ExpireTimestamp = m.GetString("expire_timestamp")
+
+		res = append(res, hostAttributes)
+		return nil
+	})
+
+	if err != nil {
+		log.Errore(err)
+	}
+	return res, err
+}
+
+// GetHostAttributesByMatch returns all attributes matching the given (optional) regexp filters
+func GetHostAttributesByMatch(hostnameMatch string, attributeNameMatch string, attributeValueMatch string) ([]HostAttributes, error) {
+	terms := []string{}
+	args := sqlutils.Args()
+	if hostnameMatch != "" {
+		terms = append(terms, ` hostname rlike ? `)
+		args = append(args, hostnameMatch)
+	}
+	if attributeNameMatch != "" {
+		terms = append(terms, ` attribute_name rlike ? `)
+		args = append(args, attributeNameMatch)
+	}
+	if attributeValueMatch != "" {
+		terms = append(terms, ` attribute_value rlike ? `)
+		args = append(args, attributeValueMatch)
+	}
+
+	if len(terms) == 0 {
+		return getHostAttributesByClause("", args)
+	}
+	whereCondition := fmt.Sprintf(" where %s ", strings.Join(terms, " and "))
+
+	return getHostAttributesByClause(whereCondition, args)
+}
+
+// GetHostAttribute expects to return a single attribute for a given hostname/attribute-name combination,
+// or an error on empty result
+func GetHostAttribute(hostname string, attributeName string) (string, error) {
+	whereClause := `where hostname=? and attribute_name=?`
+	attributes, err := getHostAttributesByClause(whereClause, sqlutils.Args(hostname, attributeName))
+	if err != nil {
+		return "", err
+	}
+	if len(attributes) == 0 {
+		return "", log.Errorf("No attribute found for %+v, %+v", hostname, attributeName)
+	}
+	return attributes[0].AttributeValue, nil
+}
+
+// SetGeneralAttribute sets an attribute not associated with a host.
Its a key-value thing +func SetGeneralAttribute(attributeName string, attributeValue string) error { + if attributeName == "" { + return nil + } + return SetHostAttributes("*", attributeName, attributeValue) +} + +// GetGeneralAttribute expects to return a single attribute value (not associated with a specific hostname) +func GetGeneralAttribute(attributeName string) (result string, err error) { + return GetHostAttribute("*", attributeName) +} + +// GetHostAttributesByAttribute +func GetHostAttributesByAttribute(attributeName string, valueMatch string) ([]HostAttributes, error) { + if valueMatch == "" { + valueMatch = ".?" + } + whereClause := ` where attribute_name = ? and attribute_value rlike ?` + + return getHostAttributesByClause(whereClause, sqlutils.Args(attributeName, valueMatch)) +} diff --git a/go/vt/orchestrator/collection/collection.go b/go/vt/orchestrator/collection/collection.go new file mode 100644 index 0000000000..599e67ff9c --- /dev/null +++ b/go/vt/orchestrator/collection/collection.go @@ -0,0 +1,292 @@ +/* + Copyright 2017 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +/* + +Package collection holds routines for collecting "high frequency" +metrics and handling their auto-expiry based on a configured retention +time. This becomes more interesting as the number of MySQL servers +monitored by orchestrator increases. + +Most monitoring systems look at different metrics over a period +like 1, 10, 30 or 60 seconds but even at second resolution orchestrator +may have polled a number of servers. + +It can be helpful to collect the raw values, and then allow external +monitoring to pull via an http api call either pre-cooked aggregate +data or the raw data for custom analysis over the period requested. + +This is expected to be used for the following types of metric: + +* discovery metrics (time to poll a MySQL server and collect status) +* queue metrics (statistics within the discovery queue itself) +* query metrics (statistics on the number of queries made to the + backend MySQL database) + +Orchestrator code can just add a new metric without worrying about +removing it later, and other code which serves API requests can +pull out the data when needed for the requested time period. + +For current metrics two api urls have been provided: one provides +the raw data and the other one provides a single set of aggregate +data which is suitable for easy collection by monitoring systems. + +Expiry is triggered by default if the collection is created via +CreateOrReturnCollection() and uses an expiry period of +DiscoveryCollectionRetentionSeconds. It can also be enabled by +calling StartAutoExpiration() after setting the required expire +period with SetExpirePeriod(). + +This will trigger periodic calls (every second) to ensure the removal +of metrics which have passed the time specified. Not enabling expiry +will mean data is collected but never freed which will make +orchestrator run out of memory eventually. 
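+
+A minimal usage sketch (pollMetric is a hypothetical caller-defined type; any
+type implementing When() satisfies the Metric interface):
+
+	type pollMetric struct{ t time.Time }
+
+	func (m pollMetric) When() time.Time { return m.t }
+
+	c := collection.CreateOrReturnCollection("POLL_METRICS")
+	c.Append(pollMetric{t: time.Now()})
+	recent, _ := c.Since(time.Now().Add(-10 * time.Second))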
+ +Current code uses DiscoveryCollectionRetentionSeconds as the +time to keep metric data. + +*/ +package collection + +import ( + "errors" + "sync" + "time" + + // "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + + "vitess.io/vitess/go/vt/orchestrator/config" +) + +// Metric is an interface containing a metric +type Metric interface { + When() time.Time // when the metric was taken +} + +// Collection contains a collection of Metrics +type Collection struct { + sync.Mutex // for locking the structure + monitoring bool // am I monitoring the queue size? + collection []Metric + done chan struct{} // to indicate that we are finishing expiry processing + expirePeriod time.Duration // time to keep the collection information for +} + +// hard-coded at every second +const defaultExpireTickerPeriod = time.Second + +// backendMetricCollection contains the last N backend "channelled" +// metrics which can then be accessed via an API call for monitoring. +var ( + namedCollection map[string](*Collection) + namedCollectionLock sync.Mutex +) + +func init() { + namedCollection = make(map[string](*Collection)) +} + +// StopMonitoring stops monitoring all the collections +func StopMonitoring() { + for _, q := range namedCollection { + q.StopAutoExpiration() + } +} + +// CreateOrReturnCollection allows for creation of a new collection or +// returning a pointer to an existing one given the name. This allows access +// to the data structure from the api interface (http/api.go) and also when writing (inst). +func CreateOrReturnCollection(name string) *Collection { + namedCollectionLock.Lock() + defer namedCollectionLock.Unlock() + if q, found := namedCollection[name]; found { + return q + } + + qmc := &Collection{ + collection: nil, + done: make(chan struct{}), + // WARNING: use a different configuration name + expirePeriod: time.Duration(config.Config.DiscoveryCollectionRetentionSeconds) * time.Second, + } + go qmc.StartAutoExpiration() + + namedCollection[name] = qmc + + return qmc +} + +// SetExpirePeriod determines after how long the collected data should be removed +func (c *Collection) SetExpirePeriod(duration time.Duration) { + c.Lock() + defer c.Unlock() + + c.expirePeriod = duration +} + +// ExpirePeriod returns the currently configured expiration period +func (c *Collection) ExpirePeriod() time.Duration { + c.Lock() + defer c.Unlock() + return c.expirePeriod +} + +// StopAutoExpiration prepares to stop by terminating the auto-expiration process +func (c *Collection) StopAutoExpiration() { + if c == nil { + return + } + c.Lock() + if !c.monitoring { + c.Unlock() + return + } + c.monitoring = false + c.Unlock() + + // no locking here deliberately + c.done <- struct{}{} +} + +// StartAutoExpiration initiates the auto expiry procedure which +// periodically checks for metrics in the collection which need to +// be expired according to bc.ExpirePeriod. 
+func (c *Collection) StartAutoExpiration() { + if c == nil { + return + } + c.Lock() + if c.monitoring { + c.Unlock() + return + } + c.monitoring = true + c.Unlock() + + // log.Infof("StartAutoExpiration: %p with expirePeriod: %v", c, c.expirePeriod) + ticker := time.NewTicker(defaultExpireTickerPeriod) + + for { + select { + case <-ticker.C: // do the periodic expiry + c.removeBefore(time.Now().Add(-c.expirePeriod)) + case <-c.done: // stop the ticker and return + ticker.Stop() + return + } + } +} + +// Metrics returns a slice containing all the metric values +func (c *Collection) Metrics() []Metric { + if c == nil { + return nil + } + c.Lock() + defer c.Unlock() + + if len(c.collection) == 0 { + return nil // nothing to return + } + return c.collection +} + +// Since returns the Metrics on or after the given time. We assume +// the metrics are stored in ascending time. +// Iterate backwards until we reach the first value before the given time +// or the end of the array. +func (c *Collection) Since(t time.Time) ([]Metric, error) { + if c == nil { + return nil, errors.New("Collection.Since: c == nil") + } + c.Lock() + defer c.Unlock() + if len(c.collection) == 0 { + return nil, nil // nothing to return + } + last := len(c.collection) + first := last - 1 + + done := false + for !done { + if c.collection[first].When().After(t) || c.collection[first].When().Equal(t) { + if first == 0 { + break // as can't go lower + } + first-- + } else { + if first != last { + first++ // go back one (except if we're already at the end) + } + break + } + } + + return c.collection[first:last], nil +} + +// removeBefore is called by StartAutoExpiration and removes collection values +// before the given time. +func (c *Collection) removeBefore(t time.Time) error { + if c == nil { + return errors.New("Collection.removeBefore: c == nil") + } + c.Lock() + defer c.Unlock() + + cLen := len(c.collection) + if cLen == 0 { + return nil // we have a collection but no data + } + // remove old data here. + first := 0 + done := false + for !done { + if c.collection[first].When().Before(t) { + first++ + if first == cLen { + break + } + } else { + first-- + break + } + } + + // get the interval we need. + if first == len(c.collection) { + c.collection = nil // remove all entries + } else if first != -1 { + c.collection = c.collection[first:] + } + return nil // no errors +} + +// Append a new Metric to the existing collection +func (c *Collection) Append(m Metric) error { + if c == nil { + return errors.New("Collection.Append: c == nil") + } + c.Lock() + defer c.Unlock() + // we don't want to add nil metrics + if c == nil { + return errors.New("Collection.Append: c == nil") + } + c.collection = append(c.collection, m) + + return nil +} diff --git a/go/vt/orchestrator/collection/collection_test.go b/go/vt/orchestrator/collection/collection_test.go new file mode 100644 index 0000000000..d98af7ace0 --- /dev/null +++ b/go/vt/orchestrator/collection/collection_test.go @@ -0,0 +1,104 @@ +/* + Copyright 2017 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +package collection + +import ( + "testing" + "time" +) + +var randomString = []string{ + "RANDOM_STRING", + "SOME_OTHER_STRING", +} + +// some random base timestamp +var ts = time.Date(2016, 12, 27, 13, 36, 40, 0, time.Local) + +// TestCreateOrReturn tests the creation of a named Collection +func TestCreateOrReturnCollection(t *testing.T) { + name := randomString[0] + // check we get the same reference with a single name + c1 := CreateOrReturnCollection(name) + if c1 == nil { + // should not be empty + t.Errorf("TestCreateOrReturn: c1 == nil, name=%s", name) + } + c2 := CreateOrReturnCollection(name) + if c2 == nil || c2 != c1 { + t.Errorf("TestCreateOrReturn: c2 == nil || c2 != c1") + // should not be empty, or different to c1 + } + + name = randomString[1] + // check we get a new reference and it's different to what we had before + c3 := CreateOrReturnCollection(name) + if c3 == nil || c3 == c1 { + // should not be empty, or same as c1 + t.Errorf("TestCreateOrReturn: c3 == nil || c3 == c1") + } + c4 := CreateOrReturnCollection(name) + // check our reference matches c3 but not c2/c1 + if c4 == nil || c4 != c3 || c4 == c2 { + t.Errorf("TestCreateOrReturn: c3 == nil || c4 != c3 || c4 == c2") + } +} + +// TestExpirePeriod checks that the set expire period is returned +func TestExpirePeriod(t *testing.T) { + oneSecond := time.Second + twoSeconds := 2 * oneSecond + + // create a new collection + c := &Collection{} + + // check if we change it we get back the value we provided + c.SetExpirePeriod(oneSecond) + if c.ExpirePeriod() != oneSecond { + t.Errorf("TestExpirePeriod: did not get back oneSecond") + } + + // change the period and check again + c.SetExpirePeriod(twoSeconds) + if c.ExpirePeriod() != twoSeconds { + t.Errorf("TestExpirePeriod: did not get back twoSeconds") + } +} + +// dummy structure for testing +type testMetric struct { +} + +func (tm *testMetric) When() time.Time { + return ts +} + +// check that Append() works as expected +func TestAppend(t *testing.T) { + c := &Collection{} + + if len(c.Metrics()) != 0 { + t.Errorf("TestAppend: len(Metrics) = %d, expecting %d", len(c.Metrics()), 0) + } + for _, v := range []int{1, 2, 3} { + tm := &testMetric{} + c.Append(tm) + if len(c.Metrics()) != v { + t.Errorf("TestExpirePeriod: len(Metrics) = %d, expecting %d", len(c.Metrics()), v) + } + } +} diff --git a/go/vt/orchestrator/config/cli_flags.go b/go/vt/orchestrator/config/cli_flags.go new file mode 100644 index 0000000000..d3ded6c2a2 --- /dev/null +++ b/go/vt/orchestrator/config/cli_flags.go @@ -0,0 +1,37 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package config + +// CLIFlags stores some command line flags that are globally available in the process' lifetime +type CLIFlags struct { + Noop *bool + SkipUnresolve *bool + SkipUnresolveCheck *bool + BinlogFile *string + GrabElection *bool + Version *bool + Statement *string + PromotionRule *string + ConfiguredVersion string + SkipBinlogSearch *bool + SkipContinuousRegistration *bool + EnableDatabaseUpdate *bool + IgnoreRaftSetup *bool + Tag *string +} + +var RuntimeCLIFlags CLIFlags diff --git a/go/vt/orchestrator/config/config.go b/go/vt/orchestrator/config/config.go new file mode 100644 index 0000000000..093d1b893b --- /dev/null +++ b/go/vt/orchestrator/config/config.go @@ -0,0 +1,677 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package config + +import ( + "encoding/json" + "fmt" + "net/url" + "os" + "regexp" + "strings" + + "gopkg.in/gcfg.v1" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +var ( + envVariableRegexp = regexp.MustCompile("[$][{](.*)[}]") +) + +const ( + LostInRecoveryDowntimeSeconds int = 60 * 60 * 24 * 365 + DefaultStatusAPIEndpoint = "/api/status" +) + +var configurationLoaded chan bool = make(chan bool) + +const ( + HealthPollSeconds = 1 + RaftHealthPollSeconds = 10 + RecoveryPollSeconds = 1 + ActiveNodeExpireSeconds = 5 + BinlogFileHistoryDays = 1 + MaintenanceOwner = "orchestrator" + AuditPageSize = 20 + MaintenancePurgeDays = 7 + MySQLTopologyMaxPoolConnections = 3 + MaintenanceExpireMinutes = 10 + AgentHttpTimeoutSeconds = 60 + PseudoGTIDCoordinatesHistoryHeuristicMinutes = 2 + DebugMetricsIntervalSeconds = 10 + PseudoGTIDSchema = "_pseudo_gtid_" + PseudoGTIDIntervalSeconds = 5 + PseudoGTIDExpireMinutes = 60 + StaleInstanceCoordinatesExpireSeconds = 60 + CheckAutoPseudoGTIDGrantsIntervalSeconds = 60 + SelectTrueQuery = "select 1" +) + +var deprecatedConfigurationVariables = []string{ + "DatabaselessMode__experimental", + "BufferBinlogEvents", + "BinlogFileHistoryDays", + "MaintenanceOwner", + "ReadLongRunningQueries", + "DiscoveryPollSeconds", + "ActiveNodeExpireSeconds", + "AuditPageSize", + "SlaveStartPostWaitMilliseconds", + "MySQLTopologyMaxPoolConnections", + "MaintenancePurgeDays", + "MaintenanceExpireMinutes", + "HttpTimeoutSeconds", + "AgentAutoDiscover", + "PseudoGTIDCoordinatesHistoryHeuristicMinutes", + "PseudoGTIDPreferIndependentMultiMatch", + "MaxOutdatedKeysToShow", +} + +// Configuration makes for orchestrator configuration input, which can be provided by user via JSON formatted file. +// Some of the parameteres have reasonable default values, and some (like database credentials) are +// strictly expected from user. +type Configuration struct { + Debug bool // set debug mode (similar to --debug option) + EnableSyslog bool // Should logs be directed (in addition) to syslog daemon? 
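+	// Defaults for the fields below are assigned in newConfiguration(); values decoded from the
+	// JSON configuration file(s) override them (see read() and postReadAdjustments()).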
+ ListenAddress string // Where orchestrator HTTP should listen for TCP + ListenSocket string // Where orchestrator HTTP should listen for unix socket (default: empty; when given, TCP is disabled) + HTTPAdvertise string // optional, for raft setups, what is the HTTP address this node will advertise to its peers (potentially use where behind NAT or when rerouting ports; example: "http://11.22.33.44:3030") + AgentsServerPort string // port orchestrator agents talk back to + MySQLTopologyUser string + MySQLTopologyPassword string + MySQLTopologyCredentialsConfigFile string // my.cnf style configuration file from where to pick credentials. Expecting `user`, `password` under `[client]` section + MySQLTopologySSLPrivateKeyFile string // Private key file used to authenticate with a Topology mysql instance with TLS + MySQLTopologySSLCertFile string // Certificate PEM file used to authenticate with a Topology mysql instance with TLS + MySQLTopologySSLCAFile string // Certificate Authority PEM file used to authenticate with a Topology mysql instance with TLS + MySQLTopologySSLSkipVerify bool // If true, do not strictly validate mutual TLS certs for Topology mysql instances + MySQLTopologyUseMutualTLS bool // Turn on TLS authentication with the Topology MySQL instances + MySQLTopologyUseMixedTLS bool // Mixed TLS and non-TLS authentication with the Topology MySQL instances + TLSCacheTTLFactor uint // Factor of InstancePollSeconds that we set as TLS info cache expiry + BackendDB string // EXPERIMENTAL: type of backend db; either "mysql" or "sqlite3" + SQLite3DataFile string // when BackendDB == "sqlite3", full path to sqlite3 datafile + SkipOrchestratorDatabaseUpdate bool // When true, do not check backend database schema nor attempt to update it. Useful when you may be running multiple versions of orchestrator, and you only wish certain boxes to dictate the db structure (or else any time a different orchestrator version runs it will rebuild database schema) + PanicIfDifferentDatabaseDeploy bool // When true, and this process finds the orchestrator backend DB was provisioned by a different version, panic + RaftEnabled bool // When true, setup orchestrator in a raft consensus layout. When false (default) all Raft* variables are ignored + RaftBind string + RaftAdvertise string + RaftDataDir string + DefaultRaftPort int // if a RaftNodes entry does not specify port, use this one + RaftNodes []string // Raft nodes to make initial connection with + ExpectFailureAnalysisConcensus bool + MySQLOrchestratorHost string + MySQLOrchestratorMaxPoolConnections int // The maximum size of the connection pool to the Orchestrator backend. + MySQLOrchestratorPort uint + MySQLOrchestratorDatabase string + MySQLOrchestratorUser string + MySQLOrchestratorPassword string + MySQLOrchestratorCredentialsConfigFile string // my.cnf style configuration file from where to pick credentials. 
Expecting `user`, `password` under `[client]` section + MySQLOrchestratorSSLPrivateKeyFile string // Private key file used to authenticate with the Orchestrator mysql instance with TLS + MySQLOrchestratorSSLCertFile string // Certificate PEM file used to authenticate with the Orchestrator mysql instance with TLS + MySQLOrchestratorSSLCAFile string // Certificate Authority PEM file used to authenticate with the Orchestrator mysql instance with TLS + MySQLOrchestratorSSLSkipVerify bool // If true, do not strictly validate mutual TLS certs for the Orchestrator mysql instances + MySQLOrchestratorUseMutualTLS bool // Turn on TLS authentication with the Orchestrator MySQL instance + MySQLOrchestratorReadTimeoutSeconds int // Number of seconds before backend mysql read operation is aborted (driver-side) + MySQLOrchestratorRejectReadOnly bool // Reject read only connections https://github.com/go-sql-driver/mysql#rejectreadonly + MySQLConnectTimeoutSeconds int // Number of seconds before connection is aborted (driver-side) + MySQLDiscoveryReadTimeoutSeconds int // Number of seconds before topology mysql read operation is aborted (driver-side). Used for discovery queries. + MySQLTopologyReadTimeoutSeconds int // Number of seconds before topology mysql read operation is aborted (driver-side). Used for all but discovery queries. + MySQLConnectionLifetimeSeconds int // Number of seconds the mysql driver will keep database connection alive before recycling it + DefaultInstancePort int // In case port was not specified on command line + SlaveLagQuery string // Synonym to ReplicationLagQuery + ReplicationLagQuery string // custom query to check on replica lg (e.g. heartbeat table). Must return a single row with a single numeric column, which is the lag. + ReplicationCredentialsQuery string // custom query to get replication credentials. Must return a single row, with two text columns: 1st is username, 2nd is password. This is optional, and can be used by orchestrator to configure replication after master takeover or setup of co-masters. You need to ensure the orchestrator user has the privileges to run this query + DiscoverByShowSlaveHosts bool // Attempt SHOW SLAVE HOSTS before PROCESSLIST + UseSuperReadOnly bool // Should orchestrator super_read_only any time it sets read_only + InstancePollSeconds uint // Number of seconds between instance reads + InstanceWriteBufferSize int // Instance write buffer size (max number of instances to flush in one INSERT ODKU) + BufferInstanceWrites bool // Set to 'true' for write-optimization on backend table (compromise: writes can be stale and overwrite non stale data) + InstanceFlushIntervalMilliseconds int // Max interval between instance write buffer flushes + SkipMaxScaleCheck bool // If you don't ever have MaxScale BinlogServer in your topology (and most people don't), set this to 'true' to save some pointless queries + UnseenInstanceForgetHours uint // Number of hours after which an unseen instance is forgotten + SnapshotTopologiesIntervalHours uint // Interval in hour between snapshot-topologies invocation. Default: 0 (disabled) + DiscoveryMaxConcurrency uint // Number of goroutines doing hosts discovery + DiscoveryQueueCapacity uint // Buffer size of the discovery queue. 
Should be greater than the number of DB instances being discovered + DiscoveryQueueMaxStatisticsSize int // The maximum number of individual secondly statistics taken of the discovery queue + DiscoveryCollectionRetentionSeconds uint // Number of seconds to retain the discovery collection information + DiscoverySeeds []string // Hard coded array of hostname:port, ensuring orchestrator discovers these hosts upon startup, assuming not already known to orchestrator + InstanceBulkOperationsWaitTimeoutSeconds uint // Time to wait on a single instance when doing bulk (many instances) operation + HostnameResolveMethod string // Method by which to "normalize" hostname ("none"/"default"/"cname") + MySQLHostnameResolveMethod string // Method by which to "normalize" hostname via MySQL server. ("none"/"@@hostname"/"@@report_host"; default "@@hostname") + SkipBinlogServerUnresolveCheck bool // Skip the double-check that an unresolved hostname resolves back to same hostname for binlog servers + ExpiryHostnameResolvesMinutes int // Number of minutes after which to expire hostname-resolves + RejectHostnameResolvePattern string // Regexp pattern for resolved hostname that will not be accepted (not cached, not written to db). This is done to avoid storing wrong resolves due to network glitches. + ReasonableReplicationLagSeconds int // Above this value is considered a problem + ProblemIgnoreHostnameFilters []string // Will minimize problem visualization for hostnames matching given regexp filters + VerifyReplicationFilters bool // Include replication filters check before approving topology refactoring + ReasonableMaintenanceReplicationLagSeconds int // Above this value move-up and move-below are blocked + CandidateInstanceExpireMinutes uint // Minutes after which a suggestion to use an instance as a candidate replica (to be preferably promoted on master failover) is expired. + AuditLogFile string // Name of log file for audit operations. Disabled when empty. + AuditToSyslog bool // If true, audit messages are written to syslog + AuditToBackendDB bool // If true, audit messages are written to the backend DB's `audit` table (default: true) + AuditPurgeDays uint // Days after which audit entries are purged from the database + RemoveTextFromHostnameDisplay string // Text to strip off the hostname on cluster/clusters pages + ReadOnly bool + AuthenticationMethod string // Type of autherntication to use, if any. "" for none, "basic" for BasicAuth, "multi" for advanced BasicAuth, "proxy" for forwarded credentials via reverse proxy, "token" for token based access + OAuthClientId string + OAuthClientSecret string + OAuthScopes []string + HTTPAuthUser string // Username for HTTP Basic authentication (blank disables authentication) + HTTPAuthPassword string // Password for HTTP Basic authentication + AuthUserHeader string // HTTP header indicating auth user, when AuthenticationMethod is "proxy" + PowerAuthUsers []string // On AuthenticationMethod == "proxy", list of users that can make changes. All others are read-only. + PowerAuthGroups []string // list of unix groups the authenticated user must be a member of to make changes. + AccessTokenUseExpirySeconds uint // Time by which an issued token must be used + AccessTokenExpiryMinutes uint // Time after which HTTP access token expires + ClusterNameToAlias map[string]string // map between regex matching cluster name to a human friendly alias + DetectClusterAliasQuery string // Optional query (executed on topology instance) that returns the alias of a cluster. 
Query will only be executed on cluster master (though until the topology's master is resovled it may execute on other/all replicas). If provided, must return one row, one column + DetectClusterDomainQuery string // Optional query (executed on topology instance) that returns the VIP/CNAME/Alias/whatever domain name for the master of this cluster. Query will only be executed on cluster master (though until the topology's master is resovled it may execute on other/all replicas). If provided, must return one row, one column + DetectInstanceAliasQuery string // Optional query (executed on topology instance) that returns the alias of an instance. If provided, must return one row, one column + DetectPromotionRuleQuery string // Optional query (executed on topology instance) that returns the promotion rule of an instance. If provided, must return one row, one column. + DataCenterPattern string // Regexp pattern with one group, extracting the datacenter name from the hostname + RegionPattern string // Regexp pattern with one group, extracting the region name from the hostname + PhysicalEnvironmentPattern string // Regexp pattern with one group, extracting physical environment info from hostname (e.g. combination of datacenter & prod/dev env) + DetectDataCenterQuery string // Optional query (executed on topology instance) that returns the data center of an instance. If provided, must return one row, one column. Overrides DataCenterPattern and useful for installments where DC cannot be inferred by hostname + DetectRegionQuery string // Optional query (executed on topology instance) that returns the region of an instance. If provided, must return one row, one column. Overrides RegionPattern and useful for installments where Region cannot be inferred by hostname + DetectPhysicalEnvironmentQuery string // Optional query (executed on topology instance) that returns the physical environment of an instance. If provided, must return one row, one column. Overrides PhysicalEnvironmentPattern and useful for installments where env cannot be inferred by hostname + DetectSemiSyncEnforcedQuery string // Optional query (executed on topology instance) to determine whether semi-sync is fully enforced for master writes (async fallback is not allowed under any circumstance). If provided, must return one row, one column, value 0 or 1. + SupportFuzzyPoolHostnames bool // Should "submit-pool-instances" command be able to pass list of fuzzy instances (fuzzy means non-fqdn, but unique enough to recognize). 
Defaults 'true', implies more queries on backend db + InstancePoolExpiryMinutes uint // Time after which entries in database_instance_pool are expired (resubmit via `submit-pool-instances`) + PromotionIgnoreHostnameFilters []string // Orchestrator will not promote replicas with hostname matching pattern (via -c recovery; for example, avoid promoting dev-dedicated machines) + ServeAgentsHttp bool // Spawn another HTTP interface dedicated for orchestrator-agent + AgentsUseSSL bool // When "true" orchestrator will listen on agents port with SSL as well as connect to agents via SSL + AgentsUseMutualTLS bool // When "true" Use mutual TLS for the server to agent communication + AgentSSLSkipVerify bool // When using SSL for the Agent, should we ignore SSL certification error + AgentSSLPrivateKeyFile string // Name of Agent SSL private key file, applies only when AgentsUseSSL = true + AgentSSLCertFile string // Name of Agent SSL certification file, applies only when AgentsUseSSL = true + AgentSSLCAFile string // Name of the Agent Certificate Authority file, applies only when AgentsUseSSL = true + AgentSSLValidOUs []string // Valid organizational units when using mutual TLS to communicate with the agents + UseSSL bool // Use SSL on the server web port + UseMutualTLS bool // When "true" Use mutual TLS for the server's web and API connections + SSLSkipVerify bool // When using SSL, should we ignore SSL certification error + SSLPrivateKeyFile string // Name of SSL private key file, applies only when UseSSL = true + SSLCertFile string // Name of SSL certification file, applies only when UseSSL = true + SSLCAFile string // Name of the Certificate Authority file, applies only when UseSSL = true + SSLValidOUs []string // Valid organizational units when using mutual TLS + StatusEndpoint string // Override the status endpoint. Defaults to '/api/status' + StatusOUVerify bool // If true, try to verify OUs when Mutual TLS is on. Defaults to false + AgentPollMinutes uint // Minutes between agent polling + UnseenAgentForgetHours uint // Number of hours after which an unseen agent is forgotten + StaleSeedFailMinutes uint // Number of minutes after which a stale (no progress) seed is considered failed. + SeedAcceptableBytesDiff int64 // Difference in bytes between seed source & target data size that is still considered as successful copy + SeedWaitSecondsBeforeSend int64 // Number of seconds for waiting before start send data command on agent + AutoPseudoGTID bool // Should orchestrator automatically inject Pseudo-GTID entries to the masters + PseudoGTIDPattern string // Pattern to look for in binary logs that makes for a unique entry (pseudo GTID). When empty, Pseudo-GTID based refactoring is disabled. + PseudoGTIDPatternIsFixedSubstring bool // If true, then PseudoGTIDPattern is not treated as regular expression but as fixed substring, and can boost search time + PseudoGTIDMonotonicHint string // subtring in Pseudo-GTID entry which indicates Pseudo-GTID entries are expected to be monotonically increasing + DetectPseudoGTIDQuery string // Optional query which is used to authoritatively decide whether pseudo gtid is enabled on instance + BinlogEventsChunkSize int // Chunk size (X) for SHOW BINLOG|RELAYLOG EVENTS LIMIT ?,X statements. Smaller means less locking and mroe work to be done + SkipBinlogEventsContaining []string // When scanning/comparing binlogs for Pseudo-GTID, skip entries containing given texts. 
These are NOT regular expressions (would consume too much CPU while scanning binlogs), just substrings to find. + ReduceReplicationAnalysisCount bool // When true, replication analysis will only report instances where possibility of handled problems is possible in the first place (e.g. will not report most leaf nodes, that are mostly uninteresting). When false, provides an entry for every known instance + FailureDetectionPeriodBlockMinutes int // The time for which an instance's failure discovery is kept "active", so as to avoid concurrent "discoveries" of the instance's failure; this preceeds any recovery process, if any. + RecoveryPeriodBlockMinutes int // (supported for backwards compatibility but please use newer `RecoveryPeriodBlockSeconds` instead) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on smae instance as well as flapping + RecoveryPeriodBlockSeconds int // (overrides `RecoveryPeriodBlockMinutes`) The time for which an instance's recovery is kept "active", so as to avoid concurrent recoveries on smae instance as well as flapping + RecoveryIgnoreHostnameFilters []string // Recovery analysis will completely ignore hosts matching given patterns + RecoverMasterClusterFilters []string // Only do master recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything) + RecoverIntermediateMasterClusterFilters []string // Only do IM recovery on clusters matching these regexp patterns (of course the ".*" pattern matches everything) + ProcessesShellCommand string // Shell that executes command scripts + OnFailureDetectionProcesses []string // Processes to execute when detecting a failover scenario (before making a decision whether to failover or not). May and should use some of these placeholders: {failureType}, {instanceType}, {isMaster}, {isCoMaster}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {autoMasterRecovery}, {autoIntermediateMasterRecovery} + PreGracefulTakeoverProcesses []string // Processes to execute before doing a failover (aborting operation should any once of them exits with non-zero code; order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isMaster}, {isCoMaster}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {countReplicas}, {replicaHosts}, {isDowntimed} + PreFailoverProcesses []string // Processes to execute before doing a failover (aborting operation should any once of them exits with non-zero code; order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isMaster}, {isCoMaster}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {countReplicas}, {replicaHosts}, {isDowntimed} + PostFailoverProcesses []string // Processes to execute after doing a failover (order of execution undefined). 
May and should use some of these placeholders: {failureType}, {instanceType}, {isMaster}, {isCoMaster}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}, {countLostReplicas} + PostUnsuccessfulFailoverProcesses []string // Processes to execute after a not-completely-successful failover (order of execution undefined). May and should use some of these placeholders: {failureType}, {instanceType}, {isMaster}, {isCoMaster}, {failureDescription}, {command}, {failedHost}, {failureCluster}, {failureClusterAlias}, {failureClusterDomain}, {failedPort}, {successorHost}, {successorPort}, {successorAlias}, {countReplicas}, {replicaHosts}, {isDowntimed}, {isSuccessful}, {lostReplicas}, {countLostReplicas} + PostMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses + PostIntermediateMasterFailoverProcesses []string // Processes to execute after doing a master failover (order of execution undefined). Uses same placeholders as PostFailoverProcesses + PostGracefulTakeoverProcesses []string // Processes to execute after runnign a graceful master takeover. Uses same placeholders as PostFailoverProcesses + PostTakeMasterProcesses []string // Processes to execute after a successful Take-Master event has taken place + CoMasterRecoveryMustPromoteOtherCoMaster bool // When 'false', anything can get promoted (and candidates are prefered over others). When 'true', orchestrator will promote the other co-master or else fail + DetachLostSlavesAfterMasterFailover bool // synonym to DetachLostReplicasAfterMasterFailover + DetachLostReplicasAfterMasterFailover bool // Should replicas that are not to be lost in master recovery (i.e. were more up-to-date than promoted replica) be forcibly detached + ApplyMySQLPromotionAfterMasterFailover bool // Should orchestrator take upon itself to apply MySQL master promotion: set read_only=0, detach replication, etc. + PreventCrossDataCenterMasterFailover bool // When true (default: false), cross-DC master failover are not allowed, orchestrator will do all it can to only fail over within same DC, or else not fail over at all. + PreventCrossRegionMasterFailover bool // When true (default: false), cross-region master failover are not allowed, orchestrator will do all it can to only fail over within same region, or else not fail over at all. + MasterFailoverLostInstancesDowntimeMinutes uint // Number of minutes to downtime any server that was lost after a master failover (including failed master & lost replicas). 0 to disable + MasterFailoverDetachSlaveMasterHost bool // synonym to MasterFailoverDetachReplicaMasterHost + MasterFailoverDetachReplicaMasterHost bool // Should orchestrator issue a detach-replica-master-host on newly promoted master (this makes sure the new master will not attempt to replicate old master if that comes back to life). Defaults 'false'. Meaningless if ApplyMySQLPromotionAfterMasterFailover is 'true'. + FailMasterPromotionOnLagMinutes uint // when > 0, fail a master promotion if the candidate replica is lagging >= configured number of minutes. 
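+	// Note: the next two settings are mutually exclusive; postReadAdjustments() rejects a configuration that enables both.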
+ FailMasterPromotionIfSQLThreadNotUpToDate bool // when true, and a master failover takes place, if candidate master has not consumed all relay logs, promotion is aborted with error + DelayMasterPromotionIfSQLThreadNotUpToDate bool // when true, and a master failover takes place, if candidate master has not consumed all relay logs, delay promotion until the sql thread has caught up + PostponeSlaveRecoveryOnLagMinutes uint // Synonym to PostponeReplicaRecoveryOnLagMinutes + PostponeReplicaRecoveryOnLagMinutes uint // On crash recovery, replicas that are lagging more than given minutes are only resurrected late in the recovery process, after master/IM has been elected and processes executed. Value of 0 disables this feature + OSCIgnoreHostnameFilters []string // OSC replicas recommendation will ignore replica hostnames matching given patterns + GraphiteAddr string // Optional; address of graphite port. If supplied, metrics will be written here + GraphitePath string // Prefix for graphite path. May include {hostname} magic placeholder + GraphiteConvertHostnameDotsToUnderscores bool // If true, then hostname's dots are converted to underscores before being used in graphite path + GraphitePollSeconds int // Graphite writes interval. 0 disables. + URLPrefix string // URL prefix to run orchestrator on non-root web path, e.g. /orchestrator to put it behind nginx. + DiscoveryIgnoreReplicaHostnameFilters []string // Regexp filters to apply to prevent auto-discovering new replicas. Usage: unreachable servers due to firewalls, applications which trigger binlog dumps + DiscoveryIgnoreMasterHostnameFilters []string // Regexp filters to apply to prevent auto-discovering a master. Usage: pointing your master temporarily to replicate seom data from external host + DiscoveryIgnoreHostnameFilters []string // Regexp filters to apply to prevent discovering instances of any kind + ConsulAddress string // Address where Consul HTTP api is found. Example: 127.0.0.1:8500 + ConsulScheme string // Scheme (http or https) for Consul + ConsulAclToken string // ACL token used to write to Consul KV + ConsulCrossDataCenterDistribution bool // should orchestrator automatically auto-deduce all consul DCs and write KVs in all DCs + ZkAddress string // UNSUPPERTED YET. Address where (single or multiple) ZooKeeper servers are found, in `srv1[:port1][,srv2[:port2]...]` format. Default port is 2181. 
Example: srv-a,srv-b:12181,srv-c + KVClusterMasterPrefix string // Prefix to use for clusters' masters entries in KV stores (internal, consul, ZK), default: "mysql/master" + WebMessage string // If provided, will be shown on all web pages below the title bar + MaxConcurrentReplicaOperations int // Maximum number of concurrent operations on replicas +} + +// ToJSONString will marshal this configuration as JSON +func (this *Configuration) ToJSONString() string { + b, _ := json.Marshal(this) + return string(b) +} + +// Config is *the* configuration instance, used globally to get configuration data +var Config = newConfiguration() +var readFileNames []string + +func newConfiguration() *Configuration { + return &Configuration{ + Debug: false, + EnableSyslog: false, + ListenAddress: ":3000", + ListenSocket: "", + HTTPAdvertise: "", + AgentsServerPort: ":3001", + StatusEndpoint: DefaultStatusAPIEndpoint, + StatusOUVerify: false, + BackendDB: "mysql", + SQLite3DataFile: "", + SkipOrchestratorDatabaseUpdate: false, + PanicIfDifferentDatabaseDeploy: false, + RaftBind: "127.0.0.1:10008", + RaftAdvertise: "", + RaftDataDir: "", + DefaultRaftPort: 10008, + RaftNodes: []string{}, + ExpectFailureAnalysisConcensus: true, + MySQLOrchestratorMaxPoolConnections: 128, // limit concurrent conns to backend DB + MySQLOrchestratorPort: 3306, + MySQLTopologyUseMutualTLS: false, + MySQLTopologyUseMixedTLS: true, + MySQLOrchestratorUseMutualTLS: false, + MySQLConnectTimeoutSeconds: 2, + MySQLOrchestratorReadTimeoutSeconds: 30, + MySQLOrchestratorRejectReadOnly: false, + MySQLDiscoveryReadTimeoutSeconds: 10, + MySQLTopologyReadTimeoutSeconds: 600, + MySQLConnectionLifetimeSeconds: 0, + DefaultInstancePort: 3306, + TLSCacheTTLFactor: 100, + InstancePollSeconds: 5, + InstanceWriteBufferSize: 100, + BufferInstanceWrites: false, + InstanceFlushIntervalMilliseconds: 100, + SkipMaxScaleCheck: true, + UnseenInstanceForgetHours: 240, + SnapshotTopologiesIntervalHours: 0, + DiscoverByShowSlaveHosts: false, + UseSuperReadOnly: false, + DiscoveryMaxConcurrency: 300, + DiscoveryQueueCapacity: 100000, + DiscoveryQueueMaxStatisticsSize: 120, + DiscoveryCollectionRetentionSeconds: 120, + DiscoverySeeds: []string{}, + InstanceBulkOperationsWaitTimeoutSeconds: 10, + HostnameResolveMethod: "default", + MySQLHostnameResolveMethod: "@@hostname", + SkipBinlogServerUnresolveCheck: true, + ExpiryHostnameResolvesMinutes: 60, + RejectHostnameResolvePattern: "", + ReasonableReplicationLagSeconds: 10, + ProblemIgnoreHostnameFilters: []string{}, + VerifyReplicationFilters: false, + ReasonableMaintenanceReplicationLagSeconds: 20, + CandidateInstanceExpireMinutes: 60, + AuditLogFile: "", + AuditToSyslog: false, + AuditToBackendDB: false, + AuditPurgeDays: 7, + RemoveTextFromHostnameDisplay: "", + ReadOnly: false, + AuthenticationMethod: "", + HTTPAuthUser: "", + HTTPAuthPassword: "", + AuthUserHeader: "X-Forwarded-User", + PowerAuthUsers: []string{"*"}, + PowerAuthGroups: []string{}, + AccessTokenUseExpirySeconds: 60, + AccessTokenExpiryMinutes: 1440, + ClusterNameToAlias: make(map[string]string), + DetectClusterAliasQuery: "", + DetectClusterDomainQuery: "", + DetectInstanceAliasQuery: "", + DetectPromotionRuleQuery: "", + DataCenterPattern: "", + PhysicalEnvironmentPattern: "", + DetectDataCenterQuery: "", + DetectPhysicalEnvironmentQuery: "", + DetectSemiSyncEnforcedQuery: "", + SupportFuzzyPoolHostnames: true, + InstancePoolExpiryMinutes: 60, + PromotionIgnoreHostnameFilters: []string{}, + ServeAgentsHttp: false, + AgentsUseSSL: false, + 
AgentsUseMutualTLS: false, + AgentSSLValidOUs: []string{}, + AgentSSLSkipVerify: false, + AgentSSLPrivateKeyFile: "", + AgentSSLCertFile: "", + AgentSSLCAFile: "", + UseSSL: false, + UseMutualTLS: false, + SSLValidOUs: []string{}, + SSLSkipVerify: false, + SSLPrivateKeyFile: "", + SSLCertFile: "", + SSLCAFile: "", + AgentPollMinutes: 60, + UnseenAgentForgetHours: 6, + StaleSeedFailMinutes: 60, + SeedAcceptableBytesDiff: 8192, + SeedWaitSecondsBeforeSend: 2, + AutoPseudoGTID: false, + PseudoGTIDPattern: "", + PseudoGTIDPatternIsFixedSubstring: false, + PseudoGTIDMonotonicHint: "", + DetectPseudoGTIDQuery: "", + BinlogEventsChunkSize: 10000, + SkipBinlogEventsContaining: []string{}, + ReduceReplicationAnalysisCount: true, + FailureDetectionPeriodBlockMinutes: 60, + RecoveryPeriodBlockMinutes: 60, + RecoveryPeriodBlockSeconds: 3600, + RecoveryIgnoreHostnameFilters: []string{}, + RecoverMasterClusterFilters: []string{}, + RecoverIntermediateMasterClusterFilters: []string{}, + ProcessesShellCommand: "bash", + OnFailureDetectionProcesses: []string{}, + PreGracefulTakeoverProcesses: []string{}, + PreFailoverProcesses: []string{}, + PostMasterFailoverProcesses: []string{}, + PostIntermediateMasterFailoverProcesses: []string{}, + PostFailoverProcesses: []string{}, + PostUnsuccessfulFailoverProcesses: []string{}, + PostGracefulTakeoverProcesses: []string{}, + PostTakeMasterProcesses: []string{}, + CoMasterRecoveryMustPromoteOtherCoMaster: true, + DetachLostSlavesAfterMasterFailover: true, + ApplyMySQLPromotionAfterMasterFailover: true, + PreventCrossDataCenterMasterFailover: false, + PreventCrossRegionMasterFailover: false, + MasterFailoverLostInstancesDowntimeMinutes: 0, + MasterFailoverDetachSlaveMasterHost: false, + FailMasterPromotionOnLagMinutes: 0, + FailMasterPromotionIfSQLThreadNotUpToDate: false, + DelayMasterPromotionIfSQLThreadNotUpToDate: false, + PostponeSlaveRecoveryOnLagMinutes: 0, + OSCIgnoreHostnameFilters: []string{}, + GraphiteAddr: "", + GraphitePath: "", + GraphiteConvertHostnameDotsToUnderscores: true, + GraphitePollSeconds: 60, + URLPrefix: "", + DiscoveryIgnoreReplicaHostnameFilters: []string{}, + ConsulAddress: "", + ConsulScheme: "http", + ConsulAclToken: "", + ConsulCrossDataCenterDistribution: false, + ZkAddress: "", + KVClusterMasterPrefix: "mysql/master", + WebMessage: "", + MaxConcurrentReplicaOperations: 5, + } +} + +func (this *Configuration) postReadAdjustments() error { + if this.MySQLOrchestratorCredentialsConfigFile != "" { + mySQLConfig := struct { + Client struct { + User string + Password string + } + }{} + err := gcfg.ReadFileInto(&mySQLConfig, this.MySQLOrchestratorCredentialsConfigFile) + if err != nil { + log.Fatalf("Failed to parse gcfg data from file: %+v", err) + } else { + log.Debugf("Parsed orchestrator credentials from %s", this.MySQLOrchestratorCredentialsConfigFile) + this.MySQLOrchestratorUser = mySQLConfig.Client.User + this.MySQLOrchestratorPassword = mySQLConfig.Client.Password + } + } + { + // We accept password in the form "${SOME_ENV_VARIABLE}" in which case we pull + // the given variable from os env + submatch := envVariableRegexp.FindStringSubmatch(this.MySQLOrchestratorPassword) + if len(submatch) > 1 { + this.MySQLOrchestratorPassword = os.Getenv(submatch[1]) + } + } + if this.MySQLTopologyCredentialsConfigFile != "" { + mySQLConfig := struct { + Client struct { + User string + Password string + } + }{} + err := gcfg.ReadFileInto(&mySQLConfig, this.MySQLTopologyCredentialsConfigFile) + if err != nil { + log.Fatalf("Failed to parse gcfg 
data from file: %+v", err) + } else { + log.Debugf("Parsed topology credentials from %s", this.MySQLTopologyCredentialsConfigFile) + this.MySQLTopologyUser = mySQLConfig.Client.User + this.MySQLTopologyPassword = mySQLConfig.Client.Password + } + } + { + // We accept password in the form "${SOME_ENV_VARIABLE}" in which case we pull + // the given variable from os env + submatch := envVariableRegexp.FindStringSubmatch(this.MySQLTopologyPassword) + if len(submatch) > 1 { + this.MySQLTopologyPassword = os.Getenv(submatch[1]) + } + } + + if this.RecoveryPeriodBlockSeconds == 0 && this.RecoveryPeriodBlockMinutes > 0 { + // RecoveryPeriodBlockSeconds is a newer addition that overrides RecoveryPeriodBlockMinutes + // The code does not consider RecoveryPeriodBlockMinutes anymore, but RecoveryPeriodBlockMinutes + // still supported in config file for backwards compatibility + this.RecoveryPeriodBlockSeconds = this.RecoveryPeriodBlockMinutes * 60 + } + + { + if this.ReplicationLagQuery != "" && this.SlaveLagQuery != "" && this.ReplicationLagQuery != this.SlaveLagQuery { + return fmt.Errorf("config's ReplicationLagQuery and SlaveLagQuery are synonyms and cannot both be defined") + } + // ReplicationLagQuery is the replacement param to SlaveLagQuery + if this.ReplicationLagQuery == "" { + this.ReplicationLagQuery = this.SlaveLagQuery + } + // We reset SlaveLagQuery because we want to support multiple config file loading; + // One of the next config files may indicate a new value for ReplicationLagQuery. + // If we do not reset SlaveLagQuery, then the two will have a conflict. + this.SlaveLagQuery = "" + } + + { + if this.DetachLostSlavesAfterMasterFailover { + this.DetachLostReplicasAfterMasterFailover = true + } + } + + { + if this.MasterFailoverDetachSlaveMasterHost { + this.MasterFailoverDetachReplicaMasterHost = true + } + } + if this.FailMasterPromotionIfSQLThreadNotUpToDate && this.DelayMasterPromotionIfSQLThreadNotUpToDate { + return fmt.Errorf("Cannot have both FailMasterPromotionIfSQLThreadNotUpToDate and DelayMasterPromotionIfSQLThreadNotUpToDate enabled") + } + if this.FailMasterPromotionOnLagMinutes > 0 && this.ReplicationLagQuery == "" { + return fmt.Errorf("nonzero FailMasterPromotionOnLagMinutes requires ReplicationLagQuery to be set") + } + { + if this.PostponeReplicaRecoveryOnLagMinutes != 0 && this.PostponeSlaveRecoveryOnLagMinutes != 0 && + this.PostponeReplicaRecoveryOnLagMinutes != this.PostponeSlaveRecoveryOnLagMinutes { + return fmt.Errorf("config's PostponeReplicaRecoveryOnLagMinutes and PostponeSlaveRecoveryOnLagMinutes are synonyms and cannot both be defined") + } + if this.PostponeSlaveRecoveryOnLagMinutes != 0 { + this.PostponeReplicaRecoveryOnLagMinutes = this.PostponeSlaveRecoveryOnLagMinutes + } + } + + if this.URLPrefix != "" { + // Ensure the prefix starts with "/" and has no trailing one. 
+ this.URLPrefix = strings.TrimLeft(this.URLPrefix, "/") + this.URLPrefix = strings.TrimRight(this.URLPrefix, "/") + this.URLPrefix = "/" + this.URLPrefix + } + + if this.IsSQLite() && this.SQLite3DataFile == "" { + return fmt.Errorf("SQLite3DataFile must be set when BackendDB is sqlite3") + } + if this.IsSQLite() { + // this.HostnameResolveMethod = "none" + } + if this.RaftEnabled && this.RaftDataDir == "" { + return fmt.Errorf("RaftDataDir must be defined since raft is enabled (RaftEnabled)") + } + if this.RaftEnabled && this.RaftBind == "" { + return fmt.Errorf("RaftBind must be defined since raft is enabled (RaftEnabled)") + } + if this.RaftAdvertise == "" { + this.RaftAdvertise = this.RaftBind + } + if this.KVClusterMasterPrefix != "/" { + // "/" remains "/" + // "prefix" turns to "prefix/" + // "some/prefix///" turns to "some/prefix/" + this.KVClusterMasterPrefix = strings.TrimRight(this.KVClusterMasterPrefix, "/") + this.KVClusterMasterPrefix = fmt.Sprintf("%s/", this.KVClusterMasterPrefix) + } + if this.AutoPseudoGTID { + this.PseudoGTIDPattern = "drop view if exists `_pseudo_gtid_`" + this.PseudoGTIDPatternIsFixedSubstring = true + this.PseudoGTIDMonotonicHint = "asc:" + this.DetectPseudoGTIDQuery = SelectTrueQuery + } + if this.HTTPAdvertise != "" { + u, err := url.Parse(this.HTTPAdvertise) + if err != nil { + return fmt.Errorf("Failed parsing HTTPAdvertise %s: %s", this.HTTPAdvertise, err.Error()) + } + if u.Scheme == "" { + return fmt.Errorf("If specified, HTTPAdvertise must include scheme (http:// or https://)") + } + if u.Hostname() == "" { + return fmt.Errorf("If specified, HTTPAdvertise must include host name") + } + if u.Port() == "" { + return fmt.Errorf("If specified, HTTPAdvertise must include port number") + } + if u.Path != "" { + return fmt.Errorf("If specified, HTTPAdvertise must not specify a path") + } + if this.InstanceWriteBufferSize <= 0 { + this.BufferInstanceWrites = false + } + } + return nil +} + +func (this *Configuration) IsSQLite() bool { + return strings.Contains(this.BackendDB, "sqlite") +} + +func (this *Configuration) IsMySQL() bool { + return this.BackendDB == "mysql" || this.BackendDB == "" +} + +// read reads configuration from given file, or silently skips if the file does not exist. +// If the file does exist, then it is expected to be in valid JSON format or the function bails out. +func read(fileName string) (*Configuration, error) { + if fileName == "" { + return Config, fmt.Errorf("Empty file name") + } + file, err := os.Open(fileName) + if err != nil { + return Config, err + } + decoder := json.NewDecoder(file) + err = decoder.Decode(Config) + if err == nil { + log.Infof("Read config: %s", fileName) + } else { + log.Fatal("Cannot read config file:", fileName, err) + } + if err := Config.postReadAdjustments(); err != nil { + log.Fatale(err) + } + return Config, err +} + +// Read reads configuration from zero, either, some or all given files, in order of input. +// A file can override configuration provided in previous file. 
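As a quick illustration of the layering behavior described above (later files override values from earlier ones, and missing files are silently skipped), here is a minimal, hypothetical usage sketch; the file paths are made up and printing BackendDB is only an example:

package main

import (
	"fmt"

	"vitess.io/vitess/go/vt/orchestrator/config"
)

func main() {
	// Hypothetical paths: a base config plus a site-local override.
	// Values read from the second file take precedence over the first.
	cfg := config.Read("/etc/orchestrator.conf.json", "/conf/orchestrator-local.json")
	config.MarkConfigurationLoaded()
	fmt.Println("backend:", cfg.BackendDB)
}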
+func Read(fileNames ...string) *Configuration { + for _, fileName := range fileNames { + read(fileName) + } + readFileNames = fileNames + return Config +} + +// ForceRead reads configuration from given file name or bails out if it fails +func ForceRead(fileName string) *Configuration { + _, err := read(fileName) + if err != nil { + log.Fatal("Cannot read config file:", fileName, err) + } + readFileNames = []string{fileName} + return Config +} + +// Reload re-reads configuration from last used files +func Reload(extraFileNames ...string) *Configuration { + for _, fileName := range readFileNames { + read(fileName) + } + for _, fileName := range extraFileNames { + read(fileName) + } + return Config +} + +// MarkConfigurationLoaded is called once configuration has first been loaded. +// Listeners on ConfigurationLoaded will get a notification +func MarkConfigurationLoaded() { + go func() { + for { + configurationLoaded <- true + } + }() + // wait for it + <-configurationLoaded +} + +// WaitForConfigurationToBeLoaded does just that. It will return after +// the configuration file has been read off disk. +func WaitForConfigurationToBeLoaded() { + <-configurationLoaded +} diff --git a/go/vt/orchestrator/config/config_test.go b/go/vt/orchestrator/config/config_test.go new file mode 100644 index 0000000000..cb58864f12 --- /dev/null +++ b/go/vt/orchestrator/config/config_test.go @@ -0,0 +1,217 @@ +package config + +import ( + "testing" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func init() { + Config.HostnameResolveMethod = "none" + log.SetLevel(log.ERROR) +} + +func TestReplicationLagQuery(t *testing.T) { + { + c := newConfiguration() + c.SlaveLagQuery = "select 3" + c.ReplicationLagQuery = "select 4" + err := c.postReadAdjustments() + test.S(t).ExpectNotNil(err) + } + { + c := newConfiguration() + c.SlaveLagQuery = "select 3" + c.ReplicationLagQuery = "select 3" + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + } + { + c := newConfiguration() + c.SlaveLagQuery = "select 3" + c.ReplicationLagQuery = "" + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(c.ReplicationLagQuery, "select 3") + } +} + +func TestPostponeReplicaRecoveryOnLagMinutes(t *testing.T) { + { + c := newConfiguration() + c.PostponeSlaveRecoveryOnLagMinutes = 3 + c.PostponeReplicaRecoveryOnLagMinutes = 5 + err := c.postReadAdjustments() + test.S(t).ExpectNotNil(err) + } + { + c := newConfiguration() + c.PostponeSlaveRecoveryOnLagMinutes = 3 + c.PostponeReplicaRecoveryOnLagMinutes = 3 + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + } + { + c := newConfiguration() + c.PostponeSlaveRecoveryOnLagMinutes = 3 + c.PostponeReplicaRecoveryOnLagMinutes = 0 + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(c.PostponeReplicaRecoveryOnLagMinutes, uint(3)) + } +} + +func TestMasterFailoverDetachReplicaMasterHost(t *testing.T) { + { + c := newConfiguration() + c.MasterFailoverDetachSlaveMasterHost = false + c.MasterFailoverDetachReplicaMasterHost = false + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectFalse(c.MasterFailoverDetachReplicaMasterHost) + } + { + c := newConfiguration() + c.MasterFailoverDetachSlaveMasterHost = false + c.MasterFailoverDetachReplicaMasterHost = true + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectTrue(c.MasterFailoverDetachReplicaMasterHost) + } + { + c := newConfiguration() + 
c.MasterFailoverDetachSlaveMasterHost = true + c.MasterFailoverDetachReplicaMasterHost = false + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectTrue(c.MasterFailoverDetachReplicaMasterHost) + } +} + +func TestMasterFailoverDetachDetachLostReplicasAfterMasterFailover(t *testing.T) { + { + c := newConfiguration() + c.DetachLostSlavesAfterMasterFailover = false + c.DetachLostReplicasAfterMasterFailover = false + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectFalse(c.DetachLostReplicasAfterMasterFailover) + } + { + c := newConfiguration() + c.DetachLostSlavesAfterMasterFailover = false + c.DetachLostReplicasAfterMasterFailover = true + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectTrue(c.DetachLostReplicasAfterMasterFailover) + } + { + c := newConfiguration() + c.DetachLostSlavesAfterMasterFailover = true + c.DetachLostReplicasAfterMasterFailover = false + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectTrue(c.DetachLostReplicasAfterMasterFailover) + } +} + +func TestRecoveryPeriodBlock(t *testing.T) { + { + c := newConfiguration() + c.RecoveryPeriodBlockSeconds = 0 + c.RecoveryPeriodBlockMinutes = 0 + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(c.RecoveryPeriodBlockSeconds, 0) + } + { + c := newConfiguration() + c.RecoveryPeriodBlockSeconds = 30 + c.RecoveryPeriodBlockMinutes = 1 + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(c.RecoveryPeriodBlockSeconds, 30) + } + { + c := newConfiguration() + c.RecoveryPeriodBlockSeconds = 0 + c.RecoveryPeriodBlockMinutes = 2 + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(c.RecoveryPeriodBlockSeconds, 120) + } + { + c := newConfiguration() + c.RecoveryPeriodBlockSeconds = 15 + c.RecoveryPeriodBlockMinutes = 0 + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(c.RecoveryPeriodBlockSeconds, 15) + } +} + +func TestRaft(t *testing.T) { + { + c := newConfiguration() + c.RaftBind = "1.2.3.4:1008" + c.RaftDataDir = "/path/to/somewhere" + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(c.RaftAdvertise, c.RaftBind) + } + { + c := newConfiguration() + c.RaftEnabled = true + err := c.postReadAdjustments() + test.S(t).ExpectNotNil(err) + } + { + c := newConfiguration() + c.RaftEnabled = true + c.RaftDataDir = "/path/to/somewhere" + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + } + { + c := newConfiguration() + c.RaftEnabled = true + c.RaftDataDir = "/path/to/somewhere" + c.RaftBind = "" + err := c.postReadAdjustments() + test.S(t).ExpectNotNil(err) + } +} + +func TestHttpAdvertise(t *testing.T) { + { + c := newConfiguration() + c.HTTPAdvertise = "" + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + } + { + c := newConfiguration() + c.HTTPAdvertise = "http://127.0.0.1:1234" + err := c.postReadAdjustments() + test.S(t).ExpectNil(err) + } + { + c := newConfiguration() + c.HTTPAdvertise = "http://127.0.0.1" + err := c.postReadAdjustments() + test.S(t).ExpectNotNil(err) + } + { + c := newConfiguration() + c.HTTPAdvertise = "127.0.0.1:1234" + err := c.postReadAdjustments() + test.S(t).ExpectNotNil(err) + } + { + c := newConfiguration() + c.HTTPAdvertise = "http://127.0.0.1:1234/mypath" + err := c.postReadAdjustments() + test.S(t).ExpectNotNil(err) + } +} diff --git a/go/vt/orchestrator/db/db.go b/go/vt/orchestrator/db/db.go new file mode 100644 index 
0000000000..47b1786320 --- /dev/null +++ b/go/vt/orchestrator/db/db.go @@ -0,0 +1,416 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package db + +import ( + "database/sql" + "fmt" + "strings" + "sync" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +var ( + EmptyArgs []interface{} +) + +var mysqlURI string +var dbMutex sync.Mutex + +type DummySqlResult struct { +} + +func (this DummySqlResult) LastInsertId() (int64, error) { + return 0, nil +} + +func (this DummySqlResult) RowsAffected() (int64, error) { + return 1, nil +} + +func getMySQLURI() string { + dbMutex.Lock() + defer dbMutex.Unlock() + if mysqlURI != "" { + return mysqlURI + } + mysqlURI = fmt.Sprintf("%s:%s@tcp(%s:%d)/%s?timeout=%ds&readTimeout=%ds&rejectReadOnly=%t&interpolateParams=true", + config.Config.MySQLOrchestratorUser, + config.Config.MySQLOrchestratorPassword, + config.Config.MySQLOrchestratorHost, + config.Config.MySQLOrchestratorPort, + config.Config.MySQLOrchestratorDatabase, + config.Config.MySQLConnectTimeoutSeconds, + config.Config.MySQLOrchestratorReadTimeoutSeconds, + config.Config.MySQLOrchestratorRejectReadOnly, + ) + if config.Config.MySQLOrchestratorUseMutualTLS { + mysqlURI, _ = SetupMySQLOrchestratorTLS(mysqlURI) + } + return mysqlURI +} + +// OpenDiscovery returns a DB instance to access a topology instance. +// It has lower read timeout than OpenTopology and is intended to +// be used with low-latency discovery queries. +func OpenDiscovery(host string, port int) (*sql.DB, error) { + return openTopology(host, port, config.Config.MySQLDiscoveryReadTimeoutSeconds) +} + +// OpenTopology returns a DB instance to access a topology instance.
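The only difference between OpenDiscovery and OpenTopology is the read timeout applied to the returned connection pool. A rough usage sketch follows; the host, port, credentials and probe query are illustrative assumptions, not part of this change:

package main

import (
	"fmt"

	"vitess.io/vitess/go/vt/orchestrator/config"
	"vitess.io/vitess/go/vt/orchestrator/db"
)

func main() {
	// Hypothetical topology credentials; normally these come from the config file.
	config.Config.MySQLTopologyUser = "orc_client_user"
	config.Config.MySQLTopologyPassword = "secret"

	// OpenDiscovery uses the shorter MySQLDiscoveryReadTimeoutSeconds.
	conn, err := db.OpenDiscovery("replica-1.example.com", 3306)
	if err != nil {
		panic(err)
	}
	var serverID uint32
	// A cheap probe query against the topology instance.
	if err := conn.QueryRow("select @@global.server_id").Scan(&serverID); err != nil {
		panic(err)
	}
	fmt.Println("server_id:", serverID)
}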
+func OpenTopology(host string, port int) (*sql.DB, error) { + return openTopology(host, port, config.Config.MySQLTopologyReadTimeoutSeconds) +} + +func openTopology(host string, port int, readTimeout int) (db *sql.DB, err error) { + mysql_uri := fmt.Sprintf("%s:%s@tcp(%s:%d)/?timeout=%ds&readTimeout=%ds&interpolateParams=true", + config.Config.MySQLTopologyUser, + config.Config.MySQLTopologyPassword, + host, port, + config.Config.MySQLConnectTimeoutSeconds, + readTimeout, + ) + + if config.Config.MySQLTopologyUseMutualTLS || + (config.Config.MySQLTopologyUseMixedTLS && requiresTLS(host, port, mysql_uri)) { + if mysql_uri, err = SetupMySQLTopologyTLS(mysql_uri); err != nil { + return nil, err + } + } + if db, _, err = sqlutils.GetDB(mysql_uri); err != nil { + return nil, err + } + if config.Config.MySQLConnectionLifetimeSeconds > 0 { + db.SetConnMaxLifetime(time.Duration(config.Config.MySQLConnectionLifetimeSeconds) * time.Second) + } + db.SetMaxOpenConns(config.MySQLTopologyMaxPoolConnections) + db.SetMaxIdleConns(config.MySQLTopologyMaxPoolConnections) + return db, err +} + +func openOrchestratorMySQLGeneric() (db *sql.DB, fromCache bool, err error) { + uri := fmt.Sprintf("%s:%s@tcp(%s:%d)/?timeout=%ds&readTimeout=%ds&interpolateParams=true", + config.Config.MySQLOrchestratorUser, + config.Config.MySQLOrchestratorPassword, + config.Config.MySQLOrchestratorHost, + config.Config.MySQLOrchestratorPort, + config.Config.MySQLConnectTimeoutSeconds, + config.Config.MySQLOrchestratorReadTimeoutSeconds, + ) + if config.Config.MySQLOrchestratorUseMutualTLS { + uri, _ = SetupMySQLOrchestratorTLS(uri) + } + return sqlutils.GetDB(uri) +} + +func IsSQLite() bool { + return config.Config.IsSQLite() +} + +func isInMemorySQLite() bool { + return config.Config.IsSQLite() && strings.Contains(config.Config.SQLite3DataFile, ":memory:") +} + +// OpenOrchestrator returns the DB instance for the orchestrator backend database +func OpenOrchestrator() (db *sql.DB, err error) { + var fromCache bool + if IsSQLite() { + db, fromCache, err = sqlutils.GetSQLiteDB(config.Config.SQLite3DataFile) + if err == nil && !fromCache { + log.Debugf("Connected to orchestrator backend: sqlite on %v", config.Config.SQLite3DataFile) + } + db.SetMaxOpenConns(1) + db.SetMaxIdleConns(1) + } else { + if db, fromCache, err := openOrchestratorMySQLGeneric(); err != nil { + return db, log.Errore(err) + } else if !fromCache { + // first time ever we talk to MySQL + query := fmt.Sprintf("create database if not exists %s", config.Config.MySQLOrchestratorDatabase) + if _, err := db.Exec(query); err != nil { + return db, log.Errore(err) + } + } + db, fromCache, err = sqlutils.GetDB(getMySQLURI()) + if err == nil && !fromCache { + // do not show the password but do show what we connect to.
+ safeMySQLURI := fmt.Sprintf("%s:?@tcp(%s:%d)/%s?timeout=%ds", config.Config.MySQLOrchestratorUser, + config.Config.MySQLOrchestratorHost, config.Config.MySQLOrchestratorPort, config.Config.MySQLOrchestratorDatabase, config.Config.MySQLConnectTimeoutSeconds) + log.Debugf("Connected to orchestrator backend: %v", safeMySQLURI) + if config.Config.MySQLOrchestratorMaxPoolConnections > 0 { + log.Debugf("Orchestrator pool SetMaxOpenConns: %d", config.Config.MySQLOrchestratorMaxPoolConnections) + db.SetMaxOpenConns(config.Config.MySQLOrchestratorMaxPoolConnections) + } + if config.Config.MySQLConnectionLifetimeSeconds > 0 { + db.SetConnMaxLifetime(time.Duration(config.Config.MySQLConnectionLifetimeSeconds) * time.Second) + } + } + } + if err == nil && !fromCache { + if !config.Config.SkipOrchestratorDatabaseUpdate { + initOrchestratorDB(db) + } + // A low value here will trigger reconnects which could + // make the number of backend connections hit the tcp + // limit. That's bad. I could make this setting dynamic + // but then people need to know which value to use. For now + // allow up to 25% of MySQLOrchestratorMaxPoolConnections + // to be idle. That should provide a good number which + // does not keep the maximum number of connections open but + // at the same time does not trigger disconnections and + // reconnections too frequently. + maxIdleConns := int(config.Config.MySQLOrchestratorMaxPoolConnections * 25 / 100) + if maxIdleConns < 10 { + maxIdleConns = 10 + } + log.Infof("Connecting to backend %s:%d: maxConnections: %d, maxIdleConns: %d", + config.Config.MySQLOrchestratorHost, + config.Config.MySQLOrchestratorPort, + config.Config.MySQLOrchestratorMaxPoolConnections, + maxIdleConns) + db.SetMaxIdleConns(maxIdleConns) + } + return db, err +} + +func translateStatement(statement string) (string, error) { + if IsSQLite() { + statement = sqlutils.ToSqlite3Dialect(statement) + } + return statement, nil +} + +// versionIsDeployed checks if given version has already been deployed +func versionIsDeployed(db *sql.DB) (result bool, err error) { + query := ` + select + count(*) as is_deployed + from + orchestrator_db_deployments + where + deployed_version = ? + ` + err = db.QueryRow(query, config.RuntimeCLIFlags.ConfiguredVersion).Scan(&result) + // err means the table 'orchestrator_db_deployments' does not even exist, in which case we proceed + // to deploy. + // If there's another error to this, like DB gone bad, then we're about to find out anyway. + return result, err +} + +// registerOrchestratorDeployment updates the orchestrator_db_deployments table upon successful deployment +func registerOrchestratorDeployment(db *sql.DB) error { + query := ` + replace into orchestrator_db_deployments ( + deployed_version, deployed_timestamp + ) values ( + ?, NOW() + ) + ` + if _, err := execInternal(db, query, config.RuntimeCLIFlags.ConfiguredVersion); err != nil { + log.Fatalf("Unable to write to orchestrator_db_deployments: %+v", err) + } + log.Debugf("Migrated database schema to version [%+v]", config.RuntimeCLIFlags.ConfiguredVersion) + return nil +} + +// deployStatements will issue given sql queries that are not already known to be deployed. +// This iterates both lists (to-run and already-deployed) and also verifies no contradictions. +func deployStatements(db *sql.DB, queries []string) error { + tx, err := db.Begin() + if err != nil { + log.Fatale(err) + } + // Ugly workaround ahead.
+ // Origin of this workaround is the existence of some "timestamp NOT NULL," column definitions, + // which are invalid under NO_ZERO_IN_DATE,NO_ZERO_DATE sql_mode (since the default is implicitly "0"). + // This means installation of orchestrator fails on such configured servers, and in particular on 5.7 + // where this setting is the default. + // For the purpose of backwards compatibility, what we do is force sql_mode to be more relaxed, create the schemas + // along with the "invalid" definitions, and then go ahead and fix those definitions via the following ALTER statements. + // My bad. + originalSqlMode := "" + if config.Config.IsMySQL() { + err = tx.QueryRow(`select @@session.sql_mode`).Scan(&originalSqlMode) + if _, err := tx.Exec(`set @@session.sql_mode=REPLACE(@@session.sql_mode, 'NO_ZERO_DATE', '')`); err != nil { + log.Fatale(err) + } + if _, err := tx.Exec(`set @@session.sql_mode=REPLACE(@@session.sql_mode, 'NO_ZERO_IN_DATE', '')`); err != nil { + log.Fatale(err) + } + } + for i, query := range queries { + if i == 0 { + //log.Debugf("sql_mode is: %+v", originalSqlMode) + } + + query, err := translateStatement(query) + if err != nil { + return log.Fatalf("Cannot initiate orchestrator: %+v; query=%+v", err, query) + } + if _, err := tx.Exec(query); err != nil { + if strings.Contains(err.Error(), "syntax error") { + return log.Fatalf("Cannot initiate orchestrator: %+v; query=%+v", err, query) + } + if !sqlutils.IsAlterTable(query) && !sqlutils.IsCreateIndex(query) && !sqlutils.IsDropIndex(query) { + return log.Fatalf("Cannot initiate orchestrator: %+v; query=%+v", err, query) + } + if !strings.Contains(err.Error(), "duplicate column name") && + !strings.Contains(err.Error(), "Duplicate column name") && + !strings.Contains(err.Error(), "check that column/key exists") && + !strings.Contains(err.Error(), "already exists") && + !strings.Contains(err.Error(), "Duplicate key name") { + log.Errorf("Error initiating orchestrator: %+v; query=%+v", err, query) + } + } + } + if config.Config.IsMySQL() { + if _, err := tx.Exec(`set session sql_mode=?`, originalSqlMode); err != nil { + log.Fatale(err) + } + } + if err := tx.Commit(); err != nil { + log.Fatale(err) + } + return nil +} + +// initOrchestratorDB attempts to create/upgrade the orchestrator backend database. It is created once in the +// application's lifetime. +func initOrchestratorDB(db *sql.DB) error { + log.Debug("Initializing orchestrator") + + versionAlreadyDeployed, err := versionIsDeployed(db) + if versionAlreadyDeployed && config.RuntimeCLIFlags.ConfiguredVersion != "" && err == nil { + // Already deployed with this version + return nil + } + if config.Config.PanicIfDifferentDatabaseDeploy && config.RuntimeCLIFlags.ConfiguredVersion != "" && !versionAlreadyDeployed { + log.Fatalf("PanicIfDifferentDatabaseDeploy is set. Configured version %s is not the version found in the database", config.RuntimeCLIFlags.ConfiguredVersion) + } + log.Debugf("Migrating database schema") + deployStatements(db, generateSQLBase) + deployStatements(db, generateSQLPatches) + registerOrchestratorDeployment(db) + + if IsSQLite() { + ExecOrchestrator(`PRAGMA journal_mode = WAL`) + ExecOrchestrator(`PRAGMA synchronous = NORMAL`) + } + + return nil +} + +// execInternal executes the given query on the given backend DB handle, after translating it to the active backend's dialect. +func execInternal(db *sql.DB, query string, args ...interface{}) (sql.Result, error) { + var err error + query, err = translateStatement(query) + if err != nil { + return nil, err + } + res, err := sqlutils.ExecNoPrepare(db, query, args...)
+ return res, err +} + +// ExecOrchestrator will execute given query on the orchestrator backend database. +func ExecOrchestrator(query string, args ...interface{}) (sql.Result, error) { + var err error + query, err = translateStatement(query) + if err != nil { + return nil, err + } + db, err := OpenOrchestrator() + if err != nil { + return nil, err + } + res, err := sqlutils.ExecNoPrepare(db, query, args...) + return res, err +} + +// QueryOrchestratorRowsMap runs the given query on the orchestrator backend database and calls on_row for each resulting row. +func QueryOrchestratorRowsMap(query string, on_row func(sqlutils.RowMap) error) error { + query, err := translateStatement(query) + if err != nil { + return log.Fatalf("Cannot query orchestrator: %+v; query=%+v", err, query) + } + db, err := OpenOrchestrator() + if err != nil { + return err + } + + return sqlutils.QueryRowsMap(db, query, on_row) +} + +// QueryOrchestrator runs the given query, with bound arguments, on the orchestrator backend database and calls on_row for each resulting row. +func QueryOrchestrator(query string, argsArray []interface{}, on_row func(sqlutils.RowMap) error) error { + query, err := translateStatement(query) + if err != nil { + return log.Fatalf("Cannot query orchestrator: %+v; query=%+v", err, query) + } + db, err := OpenOrchestrator() + if err != nil { + return err + } + + return log.Criticale(sqlutils.QueryRowsMap(db, query, on_row, argsArray...)) +} + +// QueryOrchestratorRowsMapBuffered is the buffered variant of QueryOrchestratorRowsMap. +func QueryOrchestratorRowsMapBuffered(query string, on_row func(sqlutils.RowMap) error) error { + query, err := translateStatement(query) + if err != nil { + return log.Fatalf("Cannot query orchestrator: %+v; query=%+v", err, query) + } + db, err := OpenOrchestrator() + if err != nil { + return err + } + + return sqlutils.QueryRowsMapBuffered(db, query, on_row) +} + +// QueryOrchestratorBuffered is the buffered variant of QueryOrchestrator. +func QueryOrchestratorBuffered(query string, argsArray []interface{}, on_row func(sqlutils.RowMap) error) error { + query, err := translateStatement(query) + if err != nil { + return log.Fatalf("Cannot query orchestrator: %+v; query=%+v", err, query) + } + db, err := OpenOrchestrator() + if err != nil { + return err + } + + if argsArray == nil { + argsArray = EmptyArgs + } + return log.Criticale(sqlutils.QueryRowsMapBuffered(db, query, on_row, argsArray...)) +} + +// ReadTimeNow reads and returns the current timestamp as string. This is an unfortunate workaround +// to support both MySQL and SQLite in all possible timezones. SQLite only speaks UTC whereas MySQL has +// timezone support. By reading the time as string we get the database's de-facto notion of the time, +// which we can then feed back to it. +func ReadTimeNow() (timeNow string, err error) { + err = QueryOrchestrator(`select now() as time_now`, nil, func(m sqlutils.RowMap) error { + timeNow = m.GetString("time_now") + return nil + }) + return timeNow, err +} diff --git a/go/vt/orchestrator/db/generate_base.go b/go/vt/orchestrator/db/generate_base.go new file mode 100644 index 0000000000..ee99371f71 --- /dev/null +++ b/go/vt/orchestrator/db/generate_base.go @@ -0,0 +1,854 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License.
+*/ + +package db + +// generateSQLBase & generateSQLPatches are lists of SQL statements required to build the orchestrator backend +var generateSQLBase = []string{ + ` + CREATE TABLE IF NOT EXISTS database_instance ( + hostname varchar(128) CHARACTER SET ascii NOT NULL, + port smallint(5) unsigned NOT NULL, + last_checked timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + last_seen timestamp NULL DEFAULT NULL, + server_id int(10) unsigned NOT NULL, + version varchar(128) CHARACTER SET ascii NOT NULL, + binlog_format varchar(16) CHARACTER SET ascii NOT NULL, + log_bin tinyint(3) unsigned NOT NULL, + log_slave_updates tinyint(3) unsigned NOT NULL, + binary_log_file varchar(128) CHARACTER SET ascii NOT NULL, + binary_log_pos bigint(20) unsigned NOT NULL, + master_host varchar(128) CHARACTER SET ascii NOT NULL, + master_port smallint(5) unsigned NOT NULL, + slave_sql_running tinyint(3) unsigned NOT NULL, + slave_io_running tinyint(3) unsigned NOT NULL, + master_log_file varchar(128) CHARACTER SET ascii NOT NULL, + read_master_log_pos bigint(20) unsigned NOT NULL, + relay_master_log_file varchar(128) CHARACTER SET ascii NOT NULL, + exec_master_log_pos bigint(20) unsigned NOT NULL, + seconds_behind_master bigint(20) unsigned DEFAULT NULL, + slave_lag_seconds bigint(20) unsigned DEFAULT NULL, + num_slave_hosts int(10) unsigned NOT NULL, + slave_hosts text CHARACTER SET ascii NOT NULL, + cluster_name varchar(128) CHARACTER SET ascii NOT NULL, + PRIMARY KEY (hostname,port) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX cluster_name_idx ON database_instance + `, + ` + CREATE INDEX cluster_name_idx_database_instance ON database_instance(cluster_name) + `, + ` + DROP INDEX last_checked_idx ON database_instance + `, + ` + CREATE INDEX last_checked_idx_database_instance ON database_instance(last_checked) + `, + ` + DROP INDEX last_seen_idx ON database_instance + `, + ` + CREATE INDEX last_seen_idx_database_instance ON database_instance(last_seen) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_maintenance ( + database_instance_maintenance_id int(10) unsigned NOT NULL AUTO_INCREMENT, + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + maintenance_active tinyint(4) DEFAULT NULL, + begin_timestamp timestamp NULL DEFAULT NULL, + end_timestamp timestamp NULL DEFAULT NULL, + owner varchar(128) CHARACTER SET utf8 NOT NULL, + reason text CHARACTER SET utf8 NOT NULL, + PRIMARY KEY (database_instance_maintenance_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX maintenance_uidx ON database_instance_maintenance + `, + ` + CREATE UNIQUE INDEX maintenance_uidx_database_instance_maintenance ON database_instance_maintenance (maintenance_active, hostname, port) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_long_running_queries ( + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + process_id bigint(20) NOT NULL, + process_started_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + process_user varchar(16) CHARACTER SET utf8 NOT NULL, + process_host varchar(128) CHARACTER SET utf8 NOT NULL, + process_db varchar(128) CHARACTER SET utf8 NOT NULL, + process_command varchar(16) CHARACTER SET utf8 NOT NULL, + process_time_seconds int(11) NOT NULL, + process_state varchar(128) CHARACTER SET utf8 NOT NULL, + process_info varchar(1024) CHARACTER SET utf8 NOT NULL, + PRIMARY KEY (hostname,port,process_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX process_started_at_idx ON database_instance_long_running_queries + `, + ` + 
CREATE INDEX process_started_at_idx_database_instance_long_running_queries ON database_instance_long_running_queries (process_started_at) + `, + ` + CREATE TABLE IF NOT EXISTS audit ( + audit_id bigint(20) unsigned NOT NULL AUTO_INCREMENT, + audit_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + audit_type varchar(128) CHARACTER SET ascii NOT NULL, + hostname varchar(128) CHARACTER SET ascii NOT NULL DEFAULT '', + port smallint(5) unsigned NOT NULL, + message text CHARACTER SET utf8 NOT NULL, + PRIMARY KEY (audit_id) + ) ENGINE=InnoDB DEFAULT CHARSET=latin1 + `, + ` + DROP INDEX audit_timestamp_idx ON audit + `, + ` + CREATE INDEX audit_timestamp_idx_audit ON audit (audit_timestamp) + `, + ` + DROP INDEX host_port_idx ON audit + `, + ` + CREATE INDEX host_port_idx_audit ON audit (hostname, port, audit_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS host_agent ( + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + token varchar(128) NOT NULL, + last_submitted timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + last_checked timestamp NULL DEFAULT NULL, + last_seen timestamp NULL DEFAULT NULL, + mysql_port smallint(5) unsigned DEFAULT NULL, + count_mysql_snapshots smallint(5) unsigned NOT NULL, + PRIMARY KEY (hostname) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX token_idx ON host_agent + `, + ` + CREATE INDEX token_idx_host_agent ON host_agent (token) + `, + ` + DROP INDEX last_submitted_idx ON host_agent + `, + ` + CREATE INDEX last_submitted_idx_host_agent ON host_agent (last_submitted) + `, + ` + DROP INDEX last_checked_idx ON host_agent + `, + ` + CREATE INDEX last_checked_idx_host_agent ON host_agent (last_checked) + `, + ` + DROP INDEX last_seen_idx ON host_agent + `, + ` + CREATE INDEX last_seen_idx_host_agent ON host_agent (last_seen) + `, + ` + CREATE TABLE IF NOT EXISTS agent_seed ( + agent_seed_id int(10) unsigned NOT NULL AUTO_INCREMENT, + target_hostname varchar(128) NOT NULL, + source_hostname varchar(128) NOT NULL, + start_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + end_timestamp timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + is_complete tinyint(3) unsigned NOT NULL DEFAULT '0', + is_successful tinyint(3) unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (agent_seed_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX target_hostname_idx ON agent_seed + `, + ` + CREATE INDEX target_hostname_idx_agent_seed ON agent_seed (target_hostname,is_complete) + `, + ` + DROP INDEX source_hostname_idx ON agent_seed + `, + ` + CREATE INDEX source_hostname_idx_agent_seed ON agent_seed (source_hostname,is_complete) + `, + ` + DROP INDEX start_timestamp_idx ON agent_seed + `, + ` + CREATE INDEX start_timestamp_idx_agent_seed ON agent_seed (start_timestamp) + `, + ` + DROP INDEX is_complete_idx ON agent_seed + `, + ` + CREATE INDEX is_complete_idx_agent_seed ON agent_seed (is_complete,start_timestamp) + `, + ` + DROP INDEX is_successful_idx ON agent_seed + `, + ` + CREATE INDEX is_successful_idx_agent_seed ON agent_seed (is_successful, start_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS agent_seed_state ( + agent_seed_state_id int(10) unsigned NOT NULL AUTO_INCREMENT, + agent_seed_id int(10) unsigned NOT NULL, + state_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + state_action varchar(127) NOT NULL, + error_message varchar(255) NOT NULL, + PRIMARY KEY (agent_seed_state_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX agent_seed_idx ON agent_seed_state + `, + ` + CREATE INDEX 
agent_seed_idx_agent_seed_state ON agent_seed_state (agent_seed_id, state_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS host_attributes ( + hostname varchar(128) NOT NULL, + attribute_name varchar(128) NOT NULL, + attribute_value varchar(128) NOT NULL, + submit_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + expire_timestamp timestamp NULL DEFAULT NULL, + PRIMARY KEY (hostname,attribute_name) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX attribute_name_idx ON host_attributes + `, + ` + CREATE INDEX attribute_name_idx_host_attributes ON host_attributes (attribute_name) + `, + ` + DROP INDEX attribute_value_idx ON host_attributes + `, + ` + CREATE INDEX attribute_value_idx_host_attributes ON host_attributes (attribute_value) + `, + ` + DROP INDEX submit_timestamp_idx ON host_attributes + `, + ` + CREATE INDEX submit_timestamp_idx_host_attributes ON host_attributes (submit_timestamp) + `, + ` + DROP INDEX expire_timestamp_idx ON host_attributes + `, + ` + CREATE INDEX expire_timestamp_idx_host_attributes ON host_attributes (expire_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS hostname_resolve ( + hostname varchar(128) NOT NULL, + resolved_hostname varchar(128) NOT NULL, + resolved_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (hostname) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX resolved_timestamp_idx ON hostname_resolve + `, + ` + CREATE INDEX resolved_timestamp_idx_hostname_resolve ON hostname_resolve (resolved_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS cluster_alias ( + cluster_name varchar(128) CHARACTER SET ascii NOT NULL, + alias varchar(128) NOT NULL, + PRIMARY KEY (cluster_name) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS active_node ( + anchor tinyint unsigned NOT NULL, + hostname varchar(128) CHARACTER SET ascii NOT NULL, + token varchar(128) NOT NULL, + last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (anchor) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + INSERT IGNORE INTO active_node (anchor, hostname, token, last_seen_active) + VALUES (1, '', '', NOW()) + `, + ` + CREATE TABLE IF NOT EXISTS node_health ( + hostname varchar(128) CHARACTER SET ascii NOT NULL, + token varchar(128) NOT NULL, + last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (hostname, token) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP VIEW IF EXISTS _whats_wrong + `, + ` + DROP VIEW IF EXISTS whats_wrong + `, + ` + DROP VIEW IF EXISTS whats_wrong_summary + `, + ` + CREATE TABLE IF NOT EXISTS topology_recovery ( + recovery_id bigint unsigned not null auto_increment, + hostname varchar(128) NOT NULL, + port smallint unsigned NOT NULL, + in_active_period tinyint unsigned NOT NULL DEFAULT 0, + start_active_period timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + end_active_period_unixtime int unsigned, + end_recovery timestamp NULL DEFAULT NULL, + processing_node_hostname varchar(128) CHARACTER SET ascii NOT NULL, + processcing_node_token varchar(128) NOT NULL, + successor_hostname varchar(128) DEFAULT NULL, + successor_port smallint unsigned DEFAULT NULL, + PRIMARY KEY (recovery_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX in_active_start_period_idx ON topology_recovery + `, + ` + CREATE INDEX in_active_start_period_idx_topology_recovery ON topology_recovery (in_active_period, start_active_period) + `, + ` + DROP INDEX start_active_period_idx ON topology_recovery + `, + ` + CREATE INDEX 
start_active_period_idx_topology_recovery ON topology_recovery (start_active_period) + `, + ` + DROP INDEX hostname_port_active_period_uidx ON topology_recovery + `, + ` + CREATE UNIQUE INDEX hostname_port_active_period_uidx_topology_recovery ON topology_recovery (hostname, port, in_active_period, end_active_period_unixtime) + `, + ` + CREATE TABLE IF NOT EXISTS hostname_unresolve ( + hostname varchar(128) NOT NULL, + unresolved_hostname varchar(128) NOT NULL, + PRIMARY KEY (hostname) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX unresolved_hostname_idx ON hostname_unresolve + `, + ` + CREATE INDEX unresolved_hostname_idx_hostname_unresolve ON hostname_unresolve (unresolved_hostname) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_pool ( + hostname varchar(128) CHARACTER SET ascii NOT NULL, + port smallint(5) unsigned NOT NULL, + pool varchar(128) NOT NULL, + PRIMARY KEY (hostname, port, pool) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX pool_idx ON database_instance_pool + `, + ` + CREATE INDEX pool_idx_database_instance_pool ON database_instance_pool (pool) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_topology_history ( + snapshot_unix_timestamp INT UNSIGNED NOT NULL, + hostname varchar(128) CHARACTER SET ascii NOT NULL, + port smallint(5) unsigned NOT NULL, + master_host varchar(128) CHARACTER SET ascii NOT NULL, + master_port smallint(5) unsigned NOT NULL, + cluster_name tinytext CHARACTER SET ascii NOT NULL, + PRIMARY KEY (snapshot_unix_timestamp, hostname, port) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX cluster_name_idx ON database_instance_topology_history + `, + ` + CREATE INDEX cluster_name_idx_database_instance_topology_history ON database_instance_topology_history (snapshot_unix_timestamp, cluster_name(128)) + `, + ` + CREATE TABLE IF NOT EXISTS candidate_database_instance ( + hostname varchar(128) CHARACTER SET ascii NOT NULL, + port smallint(5) unsigned NOT NULL, + last_suggested TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (hostname, port) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX last_suggested_idx ON candidate_database_instance + `, + ` + CREATE INDEX last_suggested_idx_candidate_database_instance ON candidate_database_instance (last_suggested) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_downtime ( + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + downtime_active tinyint(4) DEFAULT NULL, + begin_timestamp timestamp DEFAULT CURRENT_TIMESTAMP, + end_timestamp timestamp NULL DEFAULT NULL, + owner varchar(128) CHARACTER SET utf8 NOT NULL, + reason text CHARACTER SET utf8 NOT NULL, + PRIMARY KEY (hostname, port) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS topology_failure_detection ( + detection_id bigint(20) unsigned NOT NULL AUTO_INCREMENT, + hostname varchar(128) NOT NULL, + port smallint unsigned NOT NULL, + in_active_period tinyint unsigned NOT NULL DEFAULT '0', + start_active_period timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + end_active_period_unixtime int unsigned NOT NULL, + processing_node_hostname varchar(128) NOT NULL, + processcing_node_token varchar(128) NOT NULL, + analysis varchar(128) NOT NULL, + cluster_name varchar(128) NOT NULL, + cluster_alias varchar(128) NOT NULL, + count_affected_slaves int unsigned NOT NULL, + slave_hosts text NOT NULL, + PRIMARY KEY (detection_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX hostname_port_active_period_uidx ON 
topology_failure_detection + `, + ` + DROP INDEX in_active_start_period_idx ON topology_failure_detection + `, + ` + CREATE INDEX in_active_start_period_idx_topology_failure_detection ON topology_failure_detection (in_active_period, start_active_period) + `, + ` + CREATE TABLE IF NOT EXISTS hostname_resolve_history ( + resolved_hostname varchar(128) NOT NULL, + hostname varchar(128) NOT NULL, + resolved_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (resolved_hostname) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX hostname ON hostname_resolve_history + `, + ` + CREATE INDEX hostname_idx_hostname_resolve_history ON hostname_resolve_history (hostname) + `, + ` + DROP INDEX resolved_timestamp_idx ON hostname_resolve_history + `, + ` + CREATE INDEX resolved_timestamp_idx_hostname_resolve_history ON hostname_resolve_history (resolved_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS hostname_unresolve_history ( + unresolved_hostname varchar(128) NOT NULL, + hostname varchar(128) NOT NULL, + last_registered TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (unresolved_hostname) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX hostname ON hostname_unresolve_history + `, + ` + CREATE INDEX hostname_idx_hostname_unresolve_history ON hostname_unresolve_history (hostname) + `, + ` + DROP INDEX last_registered_idx ON hostname_unresolve_history + `, + ` + CREATE INDEX last_registered_idx_hostname_unresolve_history ON hostname_unresolve_history (last_registered) + `, + ` + CREATE TABLE IF NOT EXISTS cluster_domain_name ( + cluster_name varchar(128) CHARACTER SET ascii NOT NULL, + domain_name varchar(128) NOT NULL, + PRIMARY KEY (cluster_name) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX domain_name_idx ON cluster_domain_name + `, + ` + CREATE INDEX domain_name_idx_cluster_domain_name ON cluster_domain_name (domain_name(32)) + `, + ` + CREATE TABLE IF NOT EXISTS master_position_equivalence ( + equivalence_id bigint unsigned not null auto_increment, + master1_hostname varchar(128) CHARACTER SET ascii NOT NULL, + master1_port smallint(5) unsigned NOT NULL, + master1_binary_log_file varchar(128) CHARACTER SET ascii NOT NULL, + master1_binary_log_pos bigint(20) unsigned NOT NULL, + master2_hostname varchar(128) CHARACTER SET ascii NOT NULL, + master2_port smallint(5) unsigned NOT NULL, + master2_binary_log_file varchar(128) CHARACTER SET ascii NOT NULL, + master2_binary_log_pos bigint(20) unsigned NOT NULL, + last_suggested TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (equivalence_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX equivalence_uidx ON master_position_equivalence + `, + ` + CREATE UNIQUE INDEX equivalence_uidx_master_position_equivalence ON master_position_equivalence (master1_hostname, master1_port, master1_binary_log_file, master1_binary_log_pos, master2_hostname, master2_port) + `, + ` + DROP INDEX master2_idx ON master_position_equivalence + `, + ` + CREATE INDEX master2_idx_master_position_equivalence ON master_position_equivalence (master2_hostname, master2_port, master2_binary_log_file, master2_binary_log_pos) + `, + ` + DROP INDEX last_suggested_idx ON master_position_equivalence + `, + ` + CREATE INDEX last_suggested_idx_master_position_equivalence ON master_position_equivalence (last_suggested) + `, + ` + CREATE TABLE IF NOT EXISTS async_request ( + request_id bigint unsigned NOT NULL AUTO_INCREMENT, + command varchar(128) charset ascii not null, + hostname varchar(128) NOT 
NULL, + port smallint(5) unsigned NOT NULL, + destination_hostname varchar(128) NOT NULL, + destination_port smallint(5) unsigned NOT NULL, + pattern text CHARACTER SET utf8 NOT NULL, + gtid_hint varchar(32) charset ascii not null, + begin_timestamp timestamp NULL DEFAULT NULL, + end_timestamp timestamp NULL DEFAULT NULL, + story text CHARACTER SET utf8 NOT NULL, + PRIMARY KEY (request_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX begin_timestamp_idx ON async_request + `, + ` + CREATE INDEX begin_timestamp_idx_async_request ON async_request (begin_timestamp) + `, + ` + DROP INDEX end_timestamp_idx ON async_request + `, + ` + CREATE INDEX end_timestamp_idx_async_request ON async_request (end_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS blocked_topology_recovery ( + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + cluster_name varchar(128) NOT NULL, + analysis varchar(128) NOT NULL, + last_blocked_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + blocking_recovery_id bigint unsigned, + PRIMARY KEY (hostname, port) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX cluster_blocked_idx ON blocked_topology_recovery + `, + ` + CREATE INDEX cluster_blocked_idx_blocked_topology_recovery ON blocked_topology_recovery (cluster_name, last_blocked_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_last_analysis ( + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + analysis_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + analysis varchar(128) NOT NULL, + PRIMARY KEY (hostname, port) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX analysis_timestamp_idx ON database_instance_last_analysis + `, + ` + CREATE INDEX analysis_timestamp_idx_database_instance_last_analysis ON database_instance_last_analysis (analysis_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_analysis_changelog ( + changelog_id bigint unsigned not null auto_increment, + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + analysis_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + analysis varchar(128) NOT NULL, + PRIMARY KEY (changelog_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX analysis_timestamp_idx ON database_instance_analysis_changelog + `, + ` + CREATE INDEX analysis_timestamp_idx_database_instance_analysis_changelog ON database_instance_analysis_changelog (analysis_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS node_health_history ( + history_id bigint unsigned not null auto_increment, + hostname varchar(128) CHARACTER SET ascii NOT NULL, + token varchar(128) NOT NULL, + first_seen_active timestamp NOT NULL, + extra_info varchar(128) CHARACTER SET utf8 NOT NULL, + PRIMARY KEY (history_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX first_seen_active_idx ON node_health_history + `, + ` + CREATE INDEX first_seen_active_idx_node_health_history ON node_health_history (first_seen_active) + `, + ` + DROP INDEX hostname_token_idx ON node_health_history + `, + ` + CREATE UNIQUE INDEX hostname_token_idx_node_health_history ON node_health_history (hostname, token) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_coordinates_history ( + history_id bigint unsigned not null auto_increment, + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + recorded_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + binary_log_file varchar(128) NOT NULL, + binary_log_pos bigint(20) unsigned NOT NULL, + 
relay_log_file varchar(128) NOT NULL, + relay_log_pos bigint(20) unsigned NOT NULL, + PRIMARY KEY (history_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX hostname_port_recorded_timestmp_idx ON database_instance_coordinates_history + `, + ` + CREATE INDEX hostname_port_recorded_idx_database_instance_coordinates_history ON database_instance_coordinates_history (hostname, port, recorded_timestamp) + `, + ` + DROP INDEX recorded_timestmp_idx ON database_instance_coordinates_history + `, + ` + CREATE INDEX recorded_timestmp_idx_database_instance_coordinates_history ON database_instance_coordinates_history (recorded_timestamp) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_binlog_files_history ( + history_id bigint unsigned not null auto_increment, + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + binary_log_file varchar(128) NOT NULL, + binary_log_pos bigint(20) unsigned NOT NULL, + first_seen timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + last_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + PRIMARY KEY (history_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX hostname_port_file_idx ON database_instance_binlog_files_history + `, + ` + CREATE UNIQUE INDEX hostname_port_file_idx_database_instance_binlog_files_history ON database_instance_binlog_files_history (hostname, port, binary_log_file) + `, + ` + DROP INDEX last_seen_idx ON database_instance_binlog_files_history + `, + ` + CREATE INDEX last_seen_idx_database_instance_binlog_files_history ON database_instance_binlog_files_history (last_seen) + `, + ` + CREATE TABLE IF NOT EXISTS access_token ( + access_token_id bigint unsigned not null auto_increment, + public_token varchar(128) NOT NULL, + secret_token varchar(128) NOT NULL, + generated_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + generated_by varchar(128) CHARACTER SET utf8 NOT NULL, + is_acquired tinyint unsigned NOT NULL DEFAULT '0', + PRIMARY KEY (access_token_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX public_token_idx ON access_token + `, + ` + CREATE UNIQUE INDEX public_token_uidx_access_token ON access_token (public_token) + `, + ` + DROP INDEX generated_at_idx ON access_token + `, + ` + CREATE INDEX generated_at_idx_access_token ON access_token (generated_at) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_recent_relaylog_history ( + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + current_relay_log_file varchar(128) NOT NULL, + current_relay_log_pos bigint(20) unsigned NOT NULL, + current_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + prev_relay_log_file varchar(128) NOT NULL, + prev_relay_log_pos bigint(20) unsigned NOT NULL, + prev_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00', + PRIMARY KEY (hostname, port) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + DROP INDEX current_seen_idx ON database_instance_recent_relaylog_history + `, + ` + CREATE INDEX current_seen_idx_database_instance_recent_relaylog_history ON database_instance_recent_relaylog_history (current_seen) + `, + ` + CREATE TABLE IF NOT EXISTS orchestrator_metadata ( + anchor tinyint unsigned NOT NULL, + last_deployed_version varchar(128) CHARACTER SET ascii NOT NULL, + last_deployed_timestamp timestamp NOT NULL, + PRIMARY KEY (anchor) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS orchestrator_db_deployments ( + deployed_version varchar(128) CHARACTER SET ascii NOT NULL, + deployed_timestamp timestamp NOT NULL, + PRIMARY KEY 
(deployed_version) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS global_recovery_disable ( + disable_recovery tinyint unsigned NOT NULL COMMENT 'Insert 1 to disable recovery globally', + PRIMARY KEY (disable_recovery) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS cluster_alias_override ( + cluster_name varchar(128) CHARACTER SET ascii NOT NULL, + alias varchar(128) NOT NULL, + PRIMARY KEY (cluster_name) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS topology_recovery_steps ( + recovery_step_id bigint unsigned not null auto_increment, + recovery_uid varchar(128) CHARACTER SET ascii NOT NULL, + audit_at timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + message text CHARACTER SET utf8 NOT NULL, + PRIMARY KEY (recovery_step_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS raft_store ( + store_id bigint unsigned not null auto_increment, + store_key varbinary(512) not null, + store_value blob not null, + PRIMARY KEY (store_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE INDEX store_key_idx_raft_store ON raft_store (store_key) + `, + ` + CREATE TABLE IF NOT EXISTS raft_log ( + log_index bigint unsigned not null auto_increment, + term bigint not null, + log_type int not null, + data blob not null, + PRIMARY KEY (log_index) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS raft_snapshot ( + snapshot_id bigint unsigned not null auto_increment, + snapshot_name varchar(128) CHARACTER SET utf8 NOT NULL, + snapshot_meta varchar(4096) CHARACTER SET utf8 NOT NULL, + PRIMARY KEY (snapshot_id) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE UNIQUE INDEX snapshot_name_uidx_raft_snapshot ON raft_snapshot (snapshot_name) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_peer_analysis ( + peer varchar(128) NOT NULL, + hostname varchar(128) NOT NULL, + port smallint(5) unsigned NOT NULL, + analysis_timestamp timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + analysis varchar(128) NOT NULL, + PRIMARY KEY (peer, hostname, port) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_tls ( + hostname varchar(128) CHARACTER SET ascii NOT NULL, + port smallint(5) unsigned NOT NULL, + required tinyint unsigned NOT NULL DEFAULT 0, + PRIMARY KEY (hostname,port) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS kv_store ( + store_key varchar(255) CHARACTER SET ascii NOT NULL, + store_value text CHARACTER SET utf8 not null, + last_updated timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (store_key) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS cluster_injected_pseudo_gtid ( + cluster_name varchar(128) NOT NULL, + time_injected timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (cluster_name) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS hostname_ips ( + hostname varchar(128) CHARACTER SET ascii NOT NULL, + ipv4 varchar(128) CHARACTER SET ascii NOT NULL, + ipv6 varchar(128) CHARACTER SET ascii NOT NULL, + last_updated timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (hostname) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_tags ( + hostname varchar(128) CHARACTER SET ascii NOT NULL, + port smallint(5) unsigned NOT NULL, + tag_name varchar(128) CHARACTER SET utf8 NOT NULL, + tag_value varchar(128) CHARACTER SET utf8 NOT NULL, + 
last_updated timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (hostname, port, tag_name) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE INDEX tag_name_idx_database_instance_tags ON database_instance_tags (tag_name) + `, + ` + CREATE TABLE IF NOT EXISTS database_instance_stale_binlog_coordinates ( + hostname varchar(128) CHARACTER SET ascii NOT NULL, + port smallint(5) unsigned NOT NULL, + binary_log_file varchar(128) NOT NULL, + binary_log_pos bigint(20) unsigned NOT NULL, + first_seen timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP, + PRIMARY KEY (hostname, port) + ) ENGINE=InnoDB DEFAULT CHARSET=ascii + `, + ` + CREATE INDEX first_seen_idx_database_instance_stale_binlog_coordinates ON database_instance_stale_binlog_coordinates (first_seen) + `, +} diff --git a/go/vt/orchestrator/db/generate_patches.go b/go/vt/orchestrator/db/generate_patches.go new file mode 100644 index 0000000000..e1c198176d --- /dev/null +++ b/go/vt/orchestrator/db/generate_patches.go @@ -0,0 +1,620 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package db + +// generateSQLPatches contains DDLs for patching schema to the latest version. +// Add new statements at the end of the list so they form a changelog. +var generateSQLPatches = []string{ + ` + ALTER TABLE + database_instance + ADD COLUMN read_only TINYINT UNSIGNED NOT NULL AFTER version + `, + ` + ALTER TABLE + database_instance + ADD COLUMN last_sql_error TEXT NOT NULL AFTER exec_master_log_pos + `, + ` + ALTER TABLE + database_instance + ADD COLUMN last_io_error TEXT NOT NULL AFTER last_sql_error + `, + ` + ALTER TABLE + database_instance + ADD COLUMN oracle_gtid TINYINT UNSIGNED NOT NULL AFTER slave_io_running + `, + ` + ALTER TABLE + database_instance + ADD COLUMN mariadb_gtid TINYINT UNSIGNED NOT NULL AFTER oracle_gtid + `, + ` + ALTER TABLE + database_instance + ADD COLUMN relay_log_file varchar(128) CHARACTER SET ascii NOT NULL AFTER exec_master_log_pos + `, + ` + ALTER TABLE + database_instance + ADD COLUMN relay_log_pos bigint unsigned NOT NULL AFTER relay_log_file + `, + ` + DROP INDEX master_host_port_idx ON database_instance + `, + ` + ALTER TABLE + database_instance + ADD INDEX master_host_port_idx_database_instance (master_host, master_port) + `, + ` + ALTER TABLE + database_instance + ADD COLUMN pseudo_gtid TINYINT UNSIGNED NOT NULL AFTER mariadb_gtid + `, + ` + ALTER TABLE + database_instance + ADD COLUMN replication_depth TINYINT UNSIGNED NOT NULL AFTER cluster_name + `, + ` + ALTER TABLE + database_instance + ADD COLUMN has_replication_filters TINYINT UNSIGNED NOT NULL AFTER slave_io_running + `, + ` + ALTER TABLE + database_instance + ADD COLUMN data_center varchar(32) CHARACTER SET ascii NOT NULL AFTER cluster_name + `, + ` + ALTER TABLE + database_instance + ADD COLUMN physical_environment varchar(32) CHARACTER SET ascii NOT NULL AFTER data_center + `, + ` + ALTER TABLE + database_instance_maintenance + ADD KEY active_timestamp_idx (maintenance_active, begin_timestamp) + `, + ` + 
ALTER TABLE + database_instance + ADD COLUMN uptime INT UNSIGNED NOT NULL AFTER last_seen + `, + ` + ALTER TABLE + cluster_alias + ADD UNIQUE KEY alias_uidx (alias) + `, + ` + ALTER TABLE + database_instance + ADD COLUMN is_co_master TINYINT UNSIGNED NOT NULL AFTER replication_depth + `, + ` + ALTER TABLE + database_instance_maintenance + ADD KEY active_end_timestamp_idx (maintenance_active, end_timestamp) + `, + ` + ALTER TABLE + database_instance + ADD COLUMN sql_delay INT UNSIGNED NOT NULL AFTER slave_lag_seconds + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN analysis varchar(128) CHARACTER SET ascii NOT NULL + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN cluster_name varchar(128) CHARACTER SET ascii NOT NULL + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN cluster_alias varchar(128) CHARACTER SET ascii NOT NULL + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN count_affected_slaves int unsigned NOT NULL + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN slave_hosts text CHARACTER SET ascii NOT NULL + `, + ` + ALTER TABLE hostname_unresolve + ADD COLUMN last_registered TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP + `, + ` + ALTER TABLE hostname_unresolve + ADD KEY last_registered_idx (last_registered) + `, + ` + ALTER TABLE topology_recovery + ADD KEY cluster_name_in_active_idx (cluster_name, in_active_period) + `, + ` + ALTER TABLE topology_recovery + ADD KEY end_recovery_idx (end_recovery) + `, + ` + ALTER TABLE + database_instance + ADD COLUMN binlog_server TINYINT UNSIGNED NOT NULL AFTER version + `, + ` + ALTER TABLE cluster_domain_name + ADD COLUMN last_registered TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP + `, + ` + ALTER TABLE cluster_domain_name + ADD KEY last_registered_idx (last_registered) + `, + ` + ALTER TABLE + database_instance + ADD COLUMN supports_oracle_gtid TINYINT UNSIGNED NOT NULL AFTER oracle_gtid + `, + ` + ALTER TABLE + database_instance + ADD COLUMN executed_gtid_set text CHARACTER SET ascii NOT NULL AFTER oracle_gtid + `, + ` + ALTER TABLE + database_instance + ADD COLUMN server_uuid varchar(64) CHARACTER SET ascii NOT NULL AFTER server_id + `, + ` + ALTER TABLE + database_instance + ADD COLUMN suggested_cluster_alias varchar(128) CHARACTER SET ascii NOT NULL AFTER cluster_name + `, + ` + ALTER TABLE cluster_alias + ADD COLUMN last_registered TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP + `, + ` + ALTER TABLE cluster_alias + ADD KEY last_registered_idx (last_registered) + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN is_successful TINYINT UNSIGNED NOT NULL DEFAULT 0 AFTER processcing_node_token + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN acknowledged TINYINT UNSIGNED NOT NULL DEFAULT 0 + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN acknowledged_by varchar(128) CHARACTER SET utf8 NOT NULL + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN acknowledge_comment text CHARACTER SET utf8 NOT NULL + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN participating_instances text CHARACTER SET ascii NOT NULL after slave_hosts + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN lost_slaves text CHARACTER SET ascii NOT NULL after participating_instances + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN all_errors text CHARACTER SET ascii NOT NULL after lost_slaves + `, + ` + ALTER TABLE audit + ADD COLUMN cluster_name varchar(128) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER port + `, + ` + ALTER TABLE candidate_database_instance + ADD COLUMN priority TINYINT SIGNED NOT NULL DEFAULT 1 
comment 'positive promote, nagative unpromotes' + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN acknowledged_at TIMESTAMP NULL after acknowledged + `, + ` + ALTER TABLE + topology_recovery + ADD KEY acknowledged_idx (acknowledged, acknowledged_at) + `, + ` + ALTER TABLE + blocked_topology_recovery + ADD KEY last_blocked_idx (last_blocked_timestamp) + `, + ` + ALTER TABLE candidate_database_instance + ADD COLUMN promotion_rule enum('must', 'prefer', 'neutral', 'prefer_not', 'must_not') NOT NULL DEFAULT 'neutral' + `, + ` + ALTER TABLE node_health /* sqlite3-skip */ + DROP PRIMARY KEY, + ADD PRIMARY KEY (hostname, token) + `, + ` + ALTER TABLE node_health + ADD COLUMN extra_info varchar(128) CHARACTER SET utf8 NOT NULL + `, + ` + ALTER TABLE agent_seed /* sqlite3-skip */ + MODIFY end_timestamp timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' + `, + ` + ALTER TABLE active_node /* sqlite3-skip */ + MODIFY last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP + `, + + ` + ALTER TABLE node_health /* sqlite3-skip */ + MODIFY last_seen_active timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP + `, + ` + ALTER TABLE candidate_database_instance /* sqlite3-skip */ + MODIFY last_suggested timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP + `, + ` + ALTER TABLE master_position_equivalence /* sqlite3-skip */ + MODIFY last_suggested timestamp NOT NULL DEFAULT CURRENT_TIMESTAMP + `, + ` + ALTER TABLE + database_instance + ADD COLUMN last_attempted_check TIMESTAMP NOT NULL DEFAULT '1971-01-01 00:00:00' AFTER last_checked + `, + ` + ALTER TABLE + database_instance /* sqlite3-skip */ + MODIFY last_attempted_check TIMESTAMP NOT NULL DEFAULT '1971-01-01 00:00:00' + `, + ` + ALTER TABLE + database_instance_analysis_changelog + ADD KEY instance_timestamp_idx (hostname, port, analysis_timestamp) + `, + ` + ALTER TABLE + topology_recovery + ADD COLUMN last_detection_id bigint unsigned NOT NULL + `, + ` + ALTER TABLE + topology_recovery + ADD KEY last_detection_idx (last_detection_id) + `, + ` + ALTER TABLE node_health_history + ADD COLUMN command varchar(128) CHARACTER SET utf8 NOT NULL + `, + ` + ALTER TABLE node_health + ADD COLUMN command varchar(128) CHARACTER SET utf8 NOT NULL + `, + ` + ALTER TABLE database_instance_topology_history + ADD COLUMN version varchar(128) CHARACTER SET ascii NOT NULL + `, + ` + ALTER TABLE + database_instance + ADD COLUMN gtid_purged text CHARACTER SET ascii NOT NULL AFTER executed_gtid_set + `, + ` + ALTER TABLE + database_instance_coordinates_history + ADD COLUMN last_seen timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' AFTER recorded_timestamp + `, + ` + ALTER TABLE + access_token + ADD COLUMN is_reentrant TINYINT UNSIGNED NOT NULL default 0 + `, + ` + ALTER TABLE + access_token + ADD COLUMN acquired_at timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' + `, + ` + ALTER TABLE + database_instance_pool + ADD COLUMN registered_at timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' + `, + ` + ALTER TABLE + database_instance + ADD COLUMN replication_credentials_available TINYINT UNSIGNED NOT NULL + `, + ` + ALTER TABLE + database_instance + ADD COLUMN has_replication_credentials TINYINT UNSIGNED NOT NULL + `, + ` + ALTER TABLE + database_instance + ADD COLUMN allow_tls TINYINT UNSIGNED NOT NULL AFTER sql_delay + `, + ` + ALTER TABLE + database_instance + ADD COLUMN semi_sync_enforced TINYINT UNSIGNED NOT NULL AFTER physical_environment + `, + ` + ALTER TABLE + database_instance + ADD COLUMN instance_alias varchar(128) CHARACTER SET ascii NOT NULL AFTER physical_environment + `, + ` 
+ ALTER TABLE + topology_recovery + ADD COLUMN successor_alias varchar(128) DEFAULT NULL + `, + ` + ALTER TABLE + database_instance /* sqlite3-skip */ + MODIFY cluster_name varchar(128) NOT NULL + `, + ` + ALTER TABLE + node_health + ADD INDEX last_seen_active_idx (last_seen_active) + `, + ` + ALTER TABLE + database_instance_maintenance + ADD COLUMN processing_node_hostname varchar(128) CHARACTER SET ascii NOT NULL + `, + ` + ALTER TABLE + database_instance_maintenance + ADD COLUMN processing_node_token varchar(128) NOT NULL + `, + ` + ALTER TABLE + database_instance_maintenance + ADD COLUMN explicitly_bounded TINYINT UNSIGNED NOT NULL + `, + ` + ALTER TABLE node_health_history + ADD COLUMN app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" + `, + ` + ALTER TABLE node_health + ADD COLUMN app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" + `, + ` + ALTER TABLE node_health_history /* sqlite3-skip */ + MODIFY app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" + `, + ` + ALTER TABLE node_health /* sqlite3-skip */ + MODIFY app_version varchar(64) CHARACTER SET ascii NOT NULL DEFAULT "" + `, + ` + ALTER TABLE + database_instance + ADD COLUMN version_comment varchar(128) NOT NULL DEFAULT '' + `, + ` + ALTER TABLE active_node + ADD COLUMN first_seen_active timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' + `, + ` + ALTER TABLE node_health + ADD COLUMN first_seen_active timestamp NOT NULL DEFAULT '1971-01-01 00:00:00' + `, + ` + ALTER TABLE database_instance + ADD COLUMN major_version varchar(16) CHARACTER SET ascii NOT NULL + `, + ` + ALTER TABLE + database_instance + ADD COLUMN binlog_row_image varchar(16) CHARACTER SET ascii NOT NULL + `, + ` + ALTER TABLE topology_recovery + ADD COLUMN uid varchar(128) CHARACTER SET ascii NOT NULL + `, + ` + CREATE INDEX uid_idx_topology_recovery ON topology_recovery(uid) + `, + ` + CREATE INDEX recovery_uid_idx_topology_recovery_steps ON topology_recovery_steps(recovery_uid) + `, + ` + ALTER TABLE + database_instance + ADD COLUMN last_discovery_latency bigint not null + `, + ` + CREATE INDEX end_timestamp_idx_database_instance_downtime ON database_instance_downtime(end_timestamp) + `, + ` + CREATE INDEX suggested_cluster_alias_idx_database_instance ON database_instance(suggested_cluster_alias) + `, + ` + ALTER TABLE + topology_failure_detection + ADD COLUMN is_actionable tinyint not null default 0 + `, + ` + DROP INDEX hostname_port_active_period_uidx_topology_failure_detection ON topology_failure_detection + `, + ` + CREATE UNIQUE INDEX host_port_active_recoverable_uidx_topology_failure_detection ON topology_failure_detection (hostname, port, in_active_period, end_active_period_unixtime, is_actionable) + `, + ` + ALTER TABLE raft_snapshot + ADD COLUMN created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP + `, + ` + ALTER TABLE node_health + ADD COLUMN db_backend varchar(255) CHARACTER SET ascii NOT NULL DEFAULT "" + `, + ` + ALTER TABLE node_health + ADD COLUMN incrementing_indicator bigint not null default 0 + `, + ` + ALTER TABLE + database_instance + ADD COLUMN semi_sync_master_enabled TINYINT UNSIGNED NOT NULL + `, + ` + ALTER TABLE + database_instance + ADD COLUMN semi_sync_replica_enabled TINYINT UNSIGNED NOT NULL + `, + ` + ALTER TABLE + database_instance + ADD COLUMN gtid_mode varchar(32) CHARACTER SET ascii NOT NULL + `, + ` + ALTER TABLE + database_instance + ADD COLUMN last_check_partial_success tinyint unsigned NOT NULL after last_attempted_check + `, + ` + ALTER TABLE + database_instance + ADD COLUMN 
master_uuid varchar(64) CHARACTER SET ascii NOT NULL AFTER oracle_gtid + `, + ` + ALTER TABLE + database_instance + ADD COLUMN gtid_errant text CHARACTER SET ascii NOT NULL AFTER gtid_purged + `, + ` + ALTER TABLE + database_instance + ADD COLUMN ancestry_uuid text CHARACTER SET ascii NOT NULL AFTER master_uuid + `, + ` + ALTER TABLE + database_instance + ADD COLUMN replication_sql_thread_state tinyint signed not null default 0 AFTER slave_io_running + `, + ` + ALTER TABLE + database_instance + ADD COLUMN replication_io_thread_state tinyint signed not null default 0 AFTER replication_sql_thread_state + `, + ` + ALTER TABLE + database_instance_tags /* sqlite3-skip */ + DROP PRIMARY KEY, + ADD PRIMARY KEY (hostname, port, tag_name) + `, + ` + ALTER TABLE + database_instance + ADD COLUMN region varchar(32) CHARACTER SET ascii NOT NULL AFTER data_center + `, + ` + ALTER TABLE + database_instance + ADD COLUMN semi_sync_master_timeout INT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_master_enabled + `, + ` + ALTER TABLE + database_instance + ADD COLUMN semi_sync_master_wait_for_slave_count INT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_master_timeout + `, + ` + ALTER TABLE + database_instance + ADD COLUMN semi_sync_master_status TINYINT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_master_wait_for_slave_count + `, + ` + ALTER TABLE + database_instance + ADD COLUMN semi_sync_replica_status TINYINT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_master_status + `, + ` + ALTER TABLE + database_instance + ADD COLUMN semi_sync_master_clients INT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_master_status + `, + ` + ALTER TABLE + database_instance + ADD COLUMN semi_sync_available TINYINT UNSIGNED NOT NULL DEFAULT 0 AFTER semi_sync_enforced + `, + ` + ALTER TABLE /* sqlite3-skip */ + database_instance + MODIFY semi_sync_master_timeout BIGINT UNSIGNED NOT NULL DEFAULT 0 + `, + // Fields related to Replication Group the instance belongs to + ` + ALTER TABLE + database_instance + ADD COLUMN replication_group_name VARCHAR(64) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER gtid_mode + `, + ` + ALTER TABLE + database_instance + ADD COLUMN replication_group_is_single_primary_mode TINYINT UNSIGNED NOT NULL DEFAULT 1 AFTER replication_group_name + `, + ` + ALTER TABLE + database_instance + ADD COLUMN replication_group_member_state VARCHAR(16) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER replication_group_is_single_primary_mode + `, + ` + ALTER TABLE + database_instance + ADD COLUMN replication_group_member_role VARCHAR(16) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER replication_group_member_state + `, + ` + ALTER TABLE + database_instance + ADD COLUMN replication_group_members text CHARACTER SET ascii NOT NULL AFTER replication_group_member_role + `, + ` + ALTER TABLE + database_instance + ADD COLUMN replication_group_primary_host varchar(128) CHARACTER SET ascii NOT NULL DEFAULT '' AFTER replication_group_members + `, + ` + ALTER TABLE + database_instance + ADD COLUMN replication_group_primary_port smallint(5) unsigned NOT NULL DEFAULT 0 AFTER replication_group_primary_host + `, +} diff --git a/go/vt/orchestrator/db/tls.go b/go/vt/orchestrator/db/tls.go new file mode 100644 index 0000000000..fc38445415 --- /dev/null +++ b/go/vt/orchestrator/db/tls.go @@ -0,0 +1,146 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package db + +import ( + "crypto/tls" + "fmt" + "strings" + "time" + + "github.com/go-sql-driver/mysql" + "github.com/patrickmn/go-cache" + "github.com/rcrowley/go-metrics" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/ssl" +) + +const Error3159 = "Error 3159:" +const Error1045 = "Access denied for user" + +// Track if a TLS has already been configured for topology +var topologyTLSConfigured bool = false + +// Track if a TLS has already been configured for Orchestrator +var orchestratorTLSConfigured bool = false + +var requireTLSCache *cache.Cache = cache.New(time.Duration(config.Config.TLSCacheTTLFactor*config.Config.InstancePollSeconds)*time.Second, time.Second) + +var readInstanceTLSCounter = metrics.NewCounter() +var writeInstanceTLSCounter = metrics.NewCounter() +var readInstanceTLSCacheCounter = metrics.NewCounter() +var writeInstanceTLSCacheCounter = metrics.NewCounter() + +func init() { + metrics.Register("instance_tls.read", readInstanceTLSCounter) + metrics.Register("instance_tls.write", writeInstanceTLSCounter) + metrics.Register("instance_tls.read_cache", readInstanceTLSCacheCounter) + metrics.Register("instance_tls.write_cache", writeInstanceTLSCacheCounter) +} + +func requiresTLS(host string, port int, mysql_uri string) bool { + cacheKey := fmt.Sprintf("%s:%d", host, port) + + if value, found := requireTLSCache.Get(cacheKey); found { + readInstanceTLSCacheCounter.Inc(1) + return value.(bool) + } + + required := false + db, _, _ := sqlutils.GetDB(mysql_uri) + if err := db.Ping(); err != nil && (strings.Contains(err.Error(), Error3159) || strings.Contains(err.Error(), Error1045)) { + required = true + } + + query := ` + insert into + database_instance_tls ( + hostname, port, required + ) values ( + ?, ?, ? + ) + on duplicate key update + required=values(required) + ` + if _, err := ExecOrchestrator(query, host, port, required); err != nil { + log.Errore(err) + } + writeInstanceTLSCounter.Inc(1) + + requireTLSCache.Set(cacheKey, required, cache.DefaultExpiration) + writeInstanceTLSCacheCounter.Inc(1) + + return required +} + +// Create a TLS configuration from the config supplied CA, Certificate, and Private key. 
+// Register the TLS config with the mysql drivers as the "topology" config +// Modify the supplied URI to call the TLS config +func SetupMySQLTopologyTLS(uri string) (string, error) { + if !topologyTLSConfigured { + tlsConfig, err := ssl.NewTLSConfig(config.Config.MySQLTopologySSLCAFile, !config.Config.MySQLTopologySSLSkipVerify) + // Drop to TLS 1.0 for talking to MySQL + tlsConfig.MinVersion = tls.VersionTLS10 + if err != nil { + return "", log.Errorf("Can't create TLS configuration for Topology connection %s: %s", uri, err) + } + tlsConfig.InsecureSkipVerify = config.Config.MySQLTopologySSLSkipVerify + + if (config.Config.MySQLTopologyUseMutualTLS && !config.Config.MySQLTopologySSLSkipVerify) && + config.Config.MySQLTopologySSLCertFile != "" && + config.Config.MySQLTopologySSLPrivateKeyFile != "" { + if err = ssl.AppendKeyPair(tlsConfig, config.Config.MySQLTopologySSLCertFile, config.Config.MySQLTopologySSLPrivateKeyFile); err != nil { + return "", log.Errorf("Can't setup TLS key pairs for %s: %s", uri, err) + } + } + if err = mysql.RegisterTLSConfig("topology", tlsConfig); err != nil { + return "", log.Errorf("Can't register mysql TLS config for topology: %s", err) + } + topologyTLSConfigured = true + } + return fmt.Sprintf("%s&tls=topology", uri), nil +} + +// Create a TLS configuration from the config supplied CA, Certificate, and Private key. +// Register the TLS config with the mysql drivers as the "orchestrator" config +// Modify the supplied URI to call the TLS config +func SetupMySQLOrchestratorTLS(uri string) (string, error) { + if !orchestratorTLSConfigured { + tlsConfig, err := ssl.NewTLSConfig(config.Config.MySQLOrchestratorSSLCAFile, !config.Config.MySQLOrchestratorSSLSkipVerify) + // Drop to TLS 1.0 for talking to MySQL + tlsConfig.MinVersion = tls.VersionTLS10 + if err != nil { + return "", log.Fatalf("Can't create TLS configuration for Orchestrator connection %s: %s", uri, err) + } + tlsConfig.InsecureSkipVerify = config.Config.MySQLOrchestratorSSLSkipVerify + if (!config.Config.MySQLOrchestratorSSLSkipVerify) && + config.Config.MySQLOrchestratorSSLCertFile != "" && + config.Config.MySQLOrchestratorSSLPrivateKeyFile != "" { + if err = ssl.AppendKeyPair(tlsConfig, config.Config.MySQLOrchestratorSSLCertFile, config.Config.MySQLOrchestratorSSLPrivateKeyFile); err != nil { + return "", log.Fatalf("Can't setup TLS key pairs for %s: %s", uri, err) + } + } + if err = mysql.RegisterTLSConfig("orchestrator", tlsConfig); err != nil { + return "", log.Fatalf("Can't register mysql TLS config for orchestrator: %s", err) + } + orchestratorTLSConfigured = true + } + return fmt.Sprintf("%s&tls=orchestrator", uri), nil +} diff --git a/go/vt/orchestrator/discovery/aggregated.go b/go/vt/orchestrator/discovery/aggregated.go new file mode 100644 index 0000000000..7f9d9225ba --- /dev/null +++ b/go/vt/orchestrator/discovery/aggregated.go @@ -0,0 +1,200 @@ +/* + Copyright 2017 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package discovery + +import ( + "time" + + "github.com/montanaflynn/stats" + + "vitess.io/vitess/go/vt/orchestrator/collection" +) + +// AggregatedDiscoveryMetrics contains aggregated metrics for instance discovery. +// Called from api/discovery-metrics-aggregated/:seconds +type AggregatedDiscoveryMetrics struct { + FirstSeen time.Time // timestamp of the first data seen + LastSeen time.Time // timestamp of the last data seen + CountDistinctInstanceKeys int // number of distinct Instances seen (note: this may not be true: distinct = succeeded + failed) + CountDistinctOkInstanceKeys int // number of distinct Instances which succeeded + CountDistinctFailedInstanceKeys int // number of distinct Instances which failed + FailedDiscoveries uint64 // number of failed discoveries + SuccessfulDiscoveries uint64 // number of successful discoveries + MeanTotalSeconds float64 + MeanBackendSeconds float64 + MeanInstanceSeconds float64 + FailedMeanTotalSeconds float64 + FailedMeanBackendSeconds float64 + FailedMeanInstanceSeconds float64 + MaxTotalSeconds float64 + MaxBackendSeconds float64 + MaxInstanceSeconds float64 + FailedMaxTotalSeconds float64 + FailedMaxBackendSeconds float64 + FailedMaxInstanceSeconds float64 + MedianTotalSeconds float64 + MedianBackendSeconds float64 + MedianInstanceSeconds float64 + FailedMedianTotalSeconds float64 + FailedMedianBackendSeconds float64 + FailedMedianInstanceSeconds float64 + P95TotalSeconds float64 + P95BackendSeconds float64 + P95InstanceSeconds float64 + FailedP95TotalSeconds float64 + FailedP95BackendSeconds float64 + FailedP95InstanceSeconds float64 +} + +// aggregate returns the aggregate values of the given metrics (assumed to be Metric) +func aggregate(results []collection.Metric) AggregatedDiscoveryMetrics { + if len(results) == 0 { + return AggregatedDiscoveryMetrics{} + } + + var ( + first time.Time + last time.Time + ) + + type counterKey string + type hostKey string + type timerKey string + const ( + FailedDiscoveries counterKey = "FailedDiscoveries" + Discoveries = "Discoveries" + InstanceKeys hostKey = "InstanceKeys" + OkInstanceKeys = "OkInstanceKeys" + FailedInstanceKeys = "FailedInstanceKeys" + TotalSeconds timerKey = "TotalSeconds" + BackendSeconds = "BackendSeconds" + InstanceSeconds = "InstanceSeconds" + FailedTotalSeconds = "FailedTotalSeconds" + FailedBackendSeconds = "FailedBackendSeconds" + FailedInstanceSeconds = "FailedInstanceSeconds" + ) + + counters := make(map[counterKey]uint64) // map of string based counters + names := make(map[hostKey](map[string]int)) // map of string based names (using a map) + timings := make(map[timerKey](stats.Float64Data)) // map of string based float64 values + + // initialise counters + for _, v := range []counterKey{FailedDiscoveries, Discoveries} { + counters[v] = 0 + } + // initialise names + for _, v := range []hostKey{InstanceKeys, FailedInstanceKeys, OkInstanceKeys} { + names[v] = make(map[string]int) + } + // initialise timers + for _, v := range []timerKey{TotalSeconds, BackendSeconds, InstanceSeconds, FailedTotalSeconds, FailedBackendSeconds, FailedInstanceSeconds} { + timings[v] = nil + } + + // iterate over results storing required values + for _, v2 := range results { + v := v2.(*Metric) // convert to the right type + + // first and last + if first.IsZero() || first.After(v.Timestamp) { + first = v.Timestamp + } + if last.Before(v.Timestamp) { + last = v.Timestamp + } + + // different names + x := names[InstanceKeys] + x[v.InstanceKey.String()] = 1 // Value doesn't matter + 
names[InstanceKeys] = x + + if v.Err == nil { + // ok names + x := names[OkInstanceKeys] + x[v.InstanceKey.String()] = 1 // Value doesn't matter + names[OkInstanceKeys] = x + } else { + // failed names + x := names[FailedInstanceKeys] + x[v.InstanceKey.String()] = 1 // Value doesn't matter + names[FailedInstanceKeys] = x + } + + // discoveries + counters[Discoveries]++ + if v.Err != nil { + counters[FailedDiscoveries]++ + } + + // All timings + timings[TotalSeconds] = append(timings[TotalSeconds], v.TotalLatency.Seconds()) + timings[BackendSeconds] = append(timings[BackendSeconds], v.BackendLatency.Seconds()) + timings[InstanceSeconds] = append(timings[InstanceSeconds], v.InstanceLatency.Seconds()) + + // Failed timings + if v.Err != nil { + timings[FailedTotalSeconds] = append(timings[FailedTotalSeconds], v.TotalLatency.Seconds()) + timings[FailedBackendSeconds] = append(timings[FailedBackendSeconds], v.BackendLatency.Seconds()) + timings[FailedInstanceSeconds] = append(timings[FailedInstanceSeconds], v.InstanceLatency.Seconds()) + } + } + + return AggregatedDiscoveryMetrics{ + FirstSeen: first, + LastSeen: last, + CountDistinctInstanceKeys: len(names[InstanceKeys]), + CountDistinctOkInstanceKeys: len(names[OkInstanceKeys]), + CountDistinctFailedInstanceKeys: len(names[FailedInstanceKeys]), + FailedDiscoveries: counters[FailedDiscoveries], + SuccessfulDiscoveries: counters[Discoveries], + MeanTotalSeconds: mean(timings[TotalSeconds]), + MeanBackendSeconds: mean(timings[BackendSeconds]), + MeanInstanceSeconds: mean(timings[InstanceSeconds]), + FailedMeanTotalSeconds: mean(timings[FailedTotalSeconds]), + FailedMeanBackendSeconds: mean(timings[FailedBackendSeconds]), + FailedMeanInstanceSeconds: mean(timings[FailedInstanceSeconds]), + MaxTotalSeconds: max(timings[TotalSeconds]), + MaxBackendSeconds: max(timings[BackendSeconds]), + MaxInstanceSeconds: max(timings[InstanceSeconds]), + FailedMaxTotalSeconds: max(timings[FailedTotalSeconds]), + FailedMaxBackendSeconds: max(timings[FailedBackendSeconds]), + FailedMaxInstanceSeconds: max(timings[FailedInstanceSeconds]), + MedianTotalSeconds: median(timings[TotalSeconds]), + MedianBackendSeconds: median(timings[BackendSeconds]), + MedianInstanceSeconds: median(timings[InstanceSeconds]), + FailedMedianTotalSeconds: median(timings[FailedTotalSeconds]), + FailedMedianBackendSeconds: median(timings[FailedBackendSeconds]), + FailedMedianInstanceSeconds: median(timings[FailedInstanceSeconds]), + P95TotalSeconds: percentile(timings[TotalSeconds], 95), + P95BackendSeconds: percentile(timings[BackendSeconds], 95), + P95InstanceSeconds: percentile(timings[InstanceSeconds], 95), + FailedP95TotalSeconds: percentile(timings[FailedTotalSeconds], 95), + FailedP95BackendSeconds: percentile(timings[FailedBackendSeconds], 95), + FailedP95InstanceSeconds: percentile(timings[FailedInstanceSeconds], 95), + } +} + +// AggregatedSince returns a large number of aggregated metrics +// based on the raw metrics collected since the given time. 
+func AggregatedSince(c *collection.Collection, t time.Time) (AggregatedDiscoveryMetrics, error) { + results, err := c.Since(t) + if err != nil { + return AggregatedDiscoveryMetrics{}, err + } + + return aggregate(results), nil +} diff --git a/go/vt/orchestrator/discovery/funcs.go b/go/vt/orchestrator/discovery/funcs.go new file mode 100644 index 0000000000..e468d10a42 --- /dev/null +++ b/go/vt/orchestrator/discovery/funcs.go @@ -0,0 +1,66 @@ +/* + Copyright 2017 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package discovery + +import ( + "github.com/montanaflynn/stats" +) + +// internal routine to return the average value or 0 +func mean(values stats.Float64Data) float64 { + s, err := stats.Mean(values) + if err != nil { + return 0 + } + return s +} + +// internal routine to return the requested percentile value or 0 +func percentile(values stats.Float64Data, percent float64) float64 { + s, err := stats.Percentile(values, percent) + if err != nil { + return 0 + } + return s +} + +// internal routine to return the maximum value or 0 +func max(values stats.Float64Data) float64 { + s, err := stats.Max(values) + if err != nil { + return 0 + } + return s +} + +// internal routine to return the minimum value or 9e9 +func min(values stats.Float64Data) float64 { + s, err := stats.Min(values) + if err != nil { + return 9e9 // a large number (should use something better than this but it's ok for now) + } + return s +} + +// internal routine to return the median or 0 +func median(values stats.Float64Data) float64 { + s, err := stats.Median(values) + if err != nil { + return 0 + } + return s +} diff --git a/go/vt/orchestrator/discovery/metric.go b/go/vt/orchestrator/discovery/metric.go new file mode 100644 index 0000000000..ba99693850 --- /dev/null +++ b/go/vt/orchestrator/discovery/metric.go @@ -0,0 +1,40 @@ +/* + Copyright 2017 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package discovery + +// Collect discovery metrics and manage their storage and retrieval for monitoring purposes. 
+ +import ( + "time" + + "vitess.io/vitess/go/vt/orchestrator/inst" +) + +// Metric holds a set of information of instance discovery metrics +type Metric struct { + Timestamp time.Time // time the collection was taken + InstanceKey inst.InstanceKey // instance being monitored + BackendLatency time.Duration // time taken talking to the backend + InstanceLatency time.Duration // time taken talking to the instance + TotalLatency time.Duration // total time taken doing the discovery + Err error // error (if applicable) doing the discovery process +} + +// When did the metric happen +func (m Metric) When() time.Time { + return m.Timestamp +} diff --git a/go/vt/orchestrator/discovery/metric_json.go b/go/vt/orchestrator/discovery/metric_json.go new file mode 100644 index 0000000000..0019e78b29 --- /dev/null +++ b/go/vt/orchestrator/discovery/metric_json.go @@ -0,0 +1,74 @@ +/* + Copyright 2017 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package discovery + +// Collect discovery metrics and manage their storage and retrieval for monitoring purposes. + +import ( + "errors" + "fmt" + "time" + + "vitess.io/vitess/go/vt/orchestrator/collection" +) + +// formattedFloat is to force the JSON output to show 3 decimal places +type formattedFloat float64 + +func (m formattedFloat) String() string { + return fmt.Sprintf("%.3f", m) +} + +// MetricJSON holds a structure which represents some discovery latency information +type MetricJSON struct { + Timestamp time.Time + Hostname string + Port int + BackendLatencySeconds formattedFloat + InstanceLatencySeconds formattedFloat + TotalLatencySeconds formattedFloat + Err error +} + +// JSONSince returns an API response of discovery metric collection information +// in a printable JSON format. +func JSONSince(c *collection.Collection, t time.Time) ([](MetricJSON), error) { + if c == nil { + return nil, errors.New("MetricCollection.JSONSince: c == nil") + } + raw, err := c.Since(t) + if err != nil { + return nil, err + } + + // build up JSON response for each Metric we received + var s []MetricJSON + for i := range raw { + m := raw[i].(*Metric) // convert back to a real Metric rather than collection.Metric interface + mj := MetricJSON{ + Timestamp: m.Timestamp, + Hostname: m.InstanceKey.Hostname, + Port: m.InstanceKey.Port, + BackendLatencySeconds: formattedFloat(m.BackendLatency.Seconds()), + InstanceLatencySeconds: formattedFloat(m.InstanceLatency.Seconds()), + TotalLatencySeconds: formattedFloat(m.TotalLatency.Seconds()), + Err: m.Err, + } + s = append(s, mj) + } + return s, nil +} diff --git a/go/vt/orchestrator/discovery/queue.go b/go/vt/orchestrator/discovery/queue.go new file mode 100644 index 0000000000..e43829213f --- /dev/null +++ b/go/vt/orchestrator/discovery/queue.go @@ -0,0 +1,187 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +/* + +package discovery manages a queue of discovery requests: an ordered +queue with no duplicates. + +push() operation never blocks while pop() blocks on an empty queue. + +*/ + +package discovery + +import ( + "sync" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/inst" +) + +// QueueMetric contains the queue's active and queued sizes +type QueueMetric struct { + Active int + Queued int +} + +// Queue contains information for managing discovery requests +type Queue struct { + sync.Mutex + + name string + done chan struct{} + queue chan inst.InstanceKey + queuedKeys map[inst.InstanceKey]time.Time + consumedKeys map[inst.InstanceKey]time.Time + metrics []QueueMetric +} + +// DiscoveryQueue contains the discovery queue which can then be accessed via an API call for monitoring. +// Currently this is accessed by ContinuousDiscovery() but also from http api calls. +// I may need to protect this better? +var discoveryQueue map[string](*Queue) +var dcLock sync.Mutex + +func init() { + discoveryQueue = make(map[string](*Queue)) +} + +// StopMonitoring stops monitoring all the queues +func StopMonitoring() { + for _, q := range discoveryQueue { + q.stopMonitoring() + } +} + +// CreateOrReturnQueue allows for creation of a new discovery queue or +// returning a pointer to an existing one given the name. +func CreateOrReturnQueue(name string) *Queue { + dcLock.Lock() + defer dcLock.Unlock() + if q, found := discoveryQueue[name]; found { + return q + } + + q := &Queue{ + name: name, + queuedKeys: make(map[inst.InstanceKey]time.Time), + consumedKeys: make(map[inst.InstanceKey]time.Time), + queue: make(chan inst.InstanceKey, config.Config.DiscoveryQueueCapacity), + } + go q.startMonitoring() + + discoveryQueue[name] = q + + return q +} + +// monitoring queue sizes until we are told to stop +func (q *Queue) startMonitoring() { + log.Debugf("Queue.startMonitoring(%s)", q.name) + ticker := time.NewTicker(time.Second) // hard-coded at every second + + for { + select { + case <-ticker.C: // do the periodic expiry + q.collectStatistics() + case <-q.done: + return + } + } +} + +// Stop monitoring the queue +func (q *Queue) stopMonitoring() { + q.done <- struct{}{} +} + +// do a check of the entries in the queue, both those active and queued +func (q *Queue) collectStatistics() { + q.Lock() + defer q.Unlock() + + q.metrics = append(q.metrics, QueueMetric{Queued: len(q.queuedKeys), Active: len(q.consumedKeys)}) + + // remove old entries if we get too big + if len(q.metrics) > config.Config.DiscoveryQueueMaxStatisticsSize { + q.metrics = q.metrics[len(q.metrics)-config.Config.DiscoveryQueueMaxStatisticsSize:] + } +} + +// QueueLen returns the length of the queue (channel size + queued size) +func (q *Queue) QueueLen() int { + q.Lock() + defer q.Unlock() + + return len(q.queue) + len(q.queuedKeys) +} + +// Push enqueues a key if it is not on a queue and is not being +// processed; silently returns otherwise. 
+func (q *Queue) Push(key inst.InstanceKey) { + q.Lock() + defer q.Unlock() + + // is it enqueued already? + if _, found := q.queuedKeys[key]; found { + return + } + + // is it being processed now? + if _, found := q.consumedKeys[key]; found { + return + } + + q.queuedKeys[key] = time.Now() + q.queue <- key +} + +// Consume fetches a key to process; blocks if queue is empty. +// Release must be called once after Consume. +func (q *Queue) Consume() inst.InstanceKey { + q.Lock() + queue := q.queue + q.Unlock() + + key := <-queue + + q.Lock() + defer q.Unlock() + + // alarm if have been waiting for too long + timeOnQueue := time.Since(q.queuedKeys[key]) + if timeOnQueue > time.Duration(config.Config.InstancePollSeconds)*time.Second { + log.Warningf("key %v spent %.4fs waiting on a discoveryQueue", key, timeOnQueue.Seconds()) + } + + q.consumedKeys[key] = q.queuedKeys[key] + + delete(q.queuedKeys, key) + + return key +} + +// Release removes a key from a list of being processed keys +// which allows that key to be pushed into the queue again. +func (q *Queue) Release(key inst.InstanceKey) { + q.Lock() + defer q.Unlock() + + delete(q.consumedKeys, key) +} diff --git a/go/vt/orchestrator/discovery/queue_aggregated_stats.go b/go/vt/orchestrator/discovery/queue_aggregated_stats.go new file mode 100644 index 0000000000..d9e103a652 --- /dev/null +++ b/go/vt/orchestrator/discovery/queue_aggregated_stats.go @@ -0,0 +1,95 @@ +/* + Copyright 2017 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package discovery + +import ( + "github.com/montanaflynn/stats" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +// AggregatedQueueMetrics contains aggregate information some part queue metrics +type AggregatedQueueMetrics struct { + ActiveMinEntries float64 + ActiveMeanEntries float64 + ActiveMedianEntries float64 + ActiveP95Entries float64 + ActiveMaxEntries float64 + QueuedMinEntries float64 + QueuedMeanEntries float64 + QueuedMedianEntries float64 + QueuedP95Entries float64 + QueuedMaxEntries float64 +} + +// we pull out values in ints so convert to float64 for metric calculations +func intSliceToFloat64Slice(someInts []int) stats.Float64Data { + var slice stats.Float64Data + + for _, v := range someInts { + slice = append(slice, float64(v)) + } + + return slice +} + +// DiscoveryQueueMetrics returns some raw queue metrics based on the +// period (last N entries) requested. +func (q *Queue) DiscoveryQueueMetrics(period int) []QueueMetric { + q.Lock() + defer q.Unlock() + + // adjust period in case we ask for something that's too long + if period > len(q.metrics) { + log.Debugf("DiscoveryQueueMetrics: wanted: %d, adjusting period to %d", period, len(q.metrics)) + period = len(q.metrics) + } + + a := q.metrics[len(q.metrics)-period:] + log.Debugf("DiscoveryQueueMetrics: returning values: %+v", a) + return a +} + +// AggregatedDiscoveryQueueMetrics Returns some aggregate statistics +// based on the period (last N entries) requested. 
We store up to +// config.Config.DiscoveryQueueMaxStatisticsSize values and collect once +// a second so we expect period to be a smaller value. +func (q *Queue) AggregatedDiscoveryQueueMetrics(period int) *AggregatedQueueMetrics { + wanted := q.DiscoveryQueueMetrics(period) + + var activeEntries, queuedEntries []int + // fill vars + for i := range wanted { + activeEntries = append(activeEntries, wanted[i].Active) + queuedEntries = append(queuedEntries, wanted[i].Queued) + } + + a := &AggregatedQueueMetrics{ + ActiveMinEntries: min(intSliceToFloat64Slice(activeEntries)), + ActiveMeanEntries: mean(intSliceToFloat64Slice(activeEntries)), + ActiveMedianEntries: median(intSliceToFloat64Slice(activeEntries)), + ActiveP95Entries: percentile(intSliceToFloat64Slice(activeEntries), 95), + ActiveMaxEntries: max(intSliceToFloat64Slice(activeEntries)), + QueuedMinEntries: min(intSliceToFloat64Slice(queuedEntries)), + QueuedMeanEntries: mean(intSliceToFloat64Slice(queuedEntries)), + QueuedMedianEntries: median(intSliceToFloat64Slice(queuedEntries)), + QueuedP95Entries: percentile(intSliceToFloat64Slice(queuedEntries), 95), + QueuedMaxEntries: max(intSliceToFloat64Slice(queuedEntries)), + } + log.Debugf("AggregatedDiscoveryQueueMetrics: returning values: %+v", a) + return a +} diff --git a/go/vt/orchestrator/external/golib/README.md b/go/vt/orchestrator/external/golib/README.md new file mode 100644 index 0000000000..ee0907e945 --- /dev/null +++ b/go/vt/orchestrator/external/golib/README.md @@ -0,0 +1,9 @@ +Common Go libraries + +To import & use: +``` +go get "github.com/openark/golib/math" +go get "github.com/openark/golib/sqlutils" +go get "github.com/openark/golib/tests" +... +``` diff --git a/go/vt/orchestrator/external/golib/log/log.go b/go/vt/orchestrator/external/golib/log/log.go new file mode 100644 index 0000000000..26d33c9973 --- /dev/null +++ b/go/vt/orchestrator/external/golib/log/log.go @@ -0,0 +1,268 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package log + +import ( + "errors" + "fmt" + "log/syslog" + "os" + "runtime/debug" + "time" +) + +// LogLevel indicates the severity of a log entry +type LogLevel int + +func (this LogLevel) String() string { + switch this { + case FATAL: + return "FATAL" + case CRITICAL: + return "CRITICAL" + case ERROR: + return "ERROR" + case WARNING: + return "WARNING" + case NOTICE: + return "NOTICE" + case INFO: + return "INFO" + case DEBUG: + return "DEBUG" + } + return "unknown" +} + +func LogLevelFromString(logLevelName string) (LogLevel, error) { + switch logLevelName { + case "FATAL": + return FATAL, nil + case "CRITICAL": + return CRITICAL, nil + case "ERROR": + return ERROR, nil + case "WARNING": + return WARNING, nil + case "NOTICE": + return NOTICE, nil + case "INFO": + return INFO, nil + case "DEBUG": + return DEBUG, nil + } + return 0, fmt.Errorf("Unknown LogLevel name: %+v", logLevelName) +} + +const ( + FATAL LogLevel = iota + CRITICAL + ERROR + WARNING + NOTICE + INFO + DEBUG +) + +const TimeFormat = "2006-01-02 15:04:05" + +// globalLogLevel indicates the global level filter for all logs (only entries with level equals or higher +// than this value will be logged) +var globalLogLevel LogLevel = DEBUG +var printStackTrace bool = false + +// syslogWriter is optional, and defaults to nil (disabled) +var syslogLevel LogLevel = ERROR +var syslogWriter *syslog.Writer + +// SetPrintStackTrace enables/disables dumping the stack upon error logging +func SetPrintStackTrace(shouldPrintStackTrace bool) { + printStackTrace = shouldPrintStackTrace +} + +// SetLevel sets the global log level. Only entries with level equals or higher than +// this value will be logged +func SetLevel(logLevel LogLevel) { + globalLogLevel = logLevel +} + +// GetLevel returns current global log level +func GetLevel() LogLevel { + return globalLogLevel +} + +// EnableSyslogWriter enables, if possible, writes to syslog. These will execute _in addition_ to normal logging +func EnableSyslogWriter(tag string) (err error) { + syslogWriter, err = syslog.New(syslog.LOG_ERR, tag) + if err != nil { + syslogWriter = nil + } + return err +} + +// SetSyslogLevel sets the minimal syslog level. Only entries with level equals or higher than +// this value will be logged. However, this is also capped by the global log level. That is, +// messages with lower level than global-log-level will be discarded at any case. +func SetSyslogLevel(logLevel LogLevel) { + syslogLevel = logLevel +} + +// logFormattedEntry nicely formats and emits a log entry +func logFormattedEntry(logLevel LogLevel, message string, args ...interface{}) string { + if logLevel > globalLogLevel { + return "" + } + // if TZ env variable is set, update the timestamp timezone + localizedTime := time.Now() + tzLocation := os.Getenv("TZ") + if tzLocation != "" { + location, err := time.LoadLocation(tzLocation) + if err == nil { // if invalid tz location was provided, just leave it as the default + localizedTime = time.Now().In(location) + } + } + + msgArgs := fmt.Sprintf(message, args...) 
+ entryString := fmt.Sprintf("%s %s %s", localizedTime.Format(TimeFormat), logLevel, msgArgs) + fmt.Fprintln(os.Stderr, entryString) + + if syslogWriter != nil { + go func() error { + if logLevel > syslogLevel { + return nil + } + switch logLevel { + case FATAL: + return syslogWriter.Emerg(msgArgs) + case CRITICAL: + return syslogWriter.Crit(msgArgs) + case ERROR: + return syslogWriter.Err(msgArgs) + case WARNING: + return syslogWriter.Warning(msgArgs) + case NOTICE: + return syslogWriter.Notice(msgArgs) + case INFO: + return syslogWriter.Info(msgArgs) + case DEBUG: + return syslogWriter.Debug(msgArgs) + } + return nil + }() + } + return entryString +} + +// logEntry emits a formatted log entry +func logEntry(logLevel LogLevel, message string, args ...interface{}) string { + entryString := message + for _, s := range args { + entryString += fmt.Sprintf(" %s", s) + } + return logFormattedEntry(logLevel, entryString) +} + +// logErrorEntry emits a log entry based on given error object +func logErrorEntry(logLevel LogLevel, err error) error { + if err == nil { + // No error + return nil + } + entryString := fmt.Sprintf("%+v", err) + logEntry(logLevel, entryString) + if printStackTrace { + debug.PrintStack() + } + return err +} + +func Debug(message string, args ...interface{}) string { + return logEntry(DEBUG, message, args...) +} + +func Debugf(message string, args ...interface{}) string { + return logFormattedEntry(DEBUG, message, args...) +} + +func Info(message string, args ...interface{}) string { + return logEntry(INFO, message, args...) +} + +func Infof(message string, args ...interface{}) string { + return logFormattedEntry(INFO, message, args...) +} + +func Notice(message string, args ...interface{}) string { + return logEntry(NOTICE, message, args...) +} + +func Noticef(message string, args ...interface{}) string { + return logFormattedEntry(NOTICE, message, args...) +} + +func Warning(message string, args ...interface{}) error { + return errors.New(logEntry(WARNING, message, args...)) +} + +func Warningf(message string, args ...interface{}) error { + return errors.New(logFormattedEntry(WARNING, message, args...)) +} + +func Error(message string, args ...interface{}) error { + return errors.New(logEntry(ERROR, message, args...)) +} + +func Errorf(message string, args ...interface{}) error { + return errors.New(logFormattedEntry(ERROR, message, args...)) +} + +func Errore(err error) error { + return logErrorEntry(ERROR, err) +} + +func Critical(message string, args ...interface{}) error { + return errors.New(logEntry(CRITICAL, message, args...)) +} + +func Criticalf(message string, args ...interface{}) error { + return errors.New(logFormattedEntry(CRITICAL, message, args...)) +} + +func Criticale(err error) error { + return logErrorEntry(CRITICAL, err) +} + +// Fatal emits a FATAL level entry and exists the program +func Fatal(message string, args ...interface{}) error { + logEntry(FATAL, message, args...) + os.Exit(1) + return errors.New(logEntry(CRITICAL, message, args...)) +} + +// Fatalf emits a FATAL level entry and exists the program +func Fatalf(message string, args ...interface{}) error { + logFormattedEntry(FATAL, message, args...) 
+ os.Exit(1) + return errors.New(logFormattedEntry(CRITICAL, message, args...)) +} + +// Fatale emits a FATAL level entry and exists the program +func Fatale(err error) error { + logErrorEntry(FATAL, err) + os.Exit(1) + return err +} diff --git a/go/vt/orchestrator/external/golib/math/math.go b/go/vt/orchestrator/external/golib/math/math.go new file mode 100644 index 0000000000..f1f2068e4e --- /dev/null +++ b/go/vt/orchestrator/external/golib/math/math.go @@ -0,0 +1,119 @@ +/* + Copyright 2014 Shlomi Noach. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package math + +func MinInt(i1, i2 int) int { + if i1 < i2 { + return i1 + } + return i2 +} + +func MaxInt(i1, i2 int) int { + if i1 > i2 { + return i1 + } + return i2 +} + +func MinInt64(i1, i2 int64) int64 { + if i1 < i2 { + return i1 + } + return i2 +} + +func MaxInt64(i1, i2 int64) int64 { + if i1 > i2 { + return i1 + } + return i2 +} + +func MinUInt(i1, i2 uint) uint { + if i1 < i2 { + return i1 + } + return i2 +} + +func MaxUInt(i1, i2 uint) uint { + if i1 > i2 { + return i1 + } + return i2 +} + +func MinUInt64(i1, i2 uint64) uint64 { + if i1 < i2 { + return i1 + } + return i2 +} + +func MaxUInt64(i1, i2 uint64) uint64 { + if i1 > i2 { + return i1 + } + return i2 +} + +func MinString(i1, i2 string) string { + if i1 < i2 { + return i1 + } + return i2 +} + +func MaxString(i1, i2 string) string { + if i1 > i2 { + return i1 + } + return i2 +} + +// TernaryString acts like a "? :" C-style ternary operator for strings +func TernaryString(condition bool, resTrue string, resFalse string) string { + if condition { + return resTrue + } + return resFalse +} + +// TernaryString acts like a "? :" C-style ternary operator for ints +func TernaryInt(condition bool, resTrue int, resFalse int) int { + if condition { + return resTrue + } + return resFalse +} + +// AbsInt is an ABS function for int type +func AbsInt(i int) int { + if i >= 0 { + return i + } + return -i +} + +// AbsInt64 is an ABS function for int64 type +func AbsInt64(i int64) int64 { + if i >= 0 { + return i + } + return -i +} diff --git a/go/vt/orchestrator/external/golib/sqlutils/dialect.go b/go/vt/orchestrator/external/golib/sqlutils/dialect.go new file mode 100644 index 0000000000..19cb55d258 --- /dev/null +++ b/go/vt/orchestrator/external/golib/sqlutils/dialect.go @@ -0,0 +1,49 @@ +/* + Copyright 2017 GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package sqlutils + +import ( + "regexp" + "strings" +) + +type regexpMap struct { + r *regexp.Regexp + replacement string +} + +func (this *regexpMap) process(text string) (result string) { + return this.r.ReplaceAllString(text, this.replacement) +} + +func rmap(regexpExpression string, replacement string) regexpMap { + return regexpMap{ + r: regexp.MustCompile(regexpSpaces(regexpExpression)), + replacement: replacement, + } +} + +func regexpSpaces(statement string) string { + return strings.Replace(statement, " ", `[\s]+`, -1) +} + +func applyConversions(statement string, conversions []regexpMap) string { + for _, rmap := range conversions { + statement = rmap.process(statement) + } + return statement +} diff --git a/go/vt/orchestrator/external/golib/sqlutils/sqlite_dialect.go b/go/vt/orchestrator/external/golib/sqlutils/sqlite_dialect.go new file mode 100644 index 0000000000..5937aa42a3 --- /dev/null +++ b/go/vt/orchestrator/external/golib/sqlutils/sqlite_dialect.go @@ -0,0 +1,130 @@ +/* + Copyright 2017 GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// What's this about? +// This is a brute-force regular-expression based conversion from MySQL syntax to sqlite3 syntax. +// It is NOT meant to be a general purpose solution and is only expected & confirmed to run on +// queries issued by orchestrator. There are known limitations to this design. +// It's not even pretty. +// In fact... +// Well, it gets the job done at this time. Call it debt. 
+ +package sqlutils + +import ( + "regexp" +) + +var sqlite3CreateTableConversions = []regexpMap{ + rmap(`(?i) (character set|charset) [\S]+`, ``), + rmap(`(?i)int unsigned`, `int`), + rmap(`(?i)int[\s]*[(][\s]*([0-9]+)[\s]*[)] unsigned`, `int`), + rmap(`(?i)engine[\s]*=[\s]*(innodb|myisam|ndb|memory|tokudb)`, ``), + rmap(`(?i)DEFAULT CHARSET[\s]*=[\s]*[\S]+`, ``), + rmap(`(?i)[\S]*int( not null|) auto_increment`, `integer`), + rmap(`(?i)comment '[^']*'`, ``), + rmap(`(?i)after [\S]+`, ``), + rmap(`(?i)alter table ([\S]+) add (index|key) ([\S]+) (.+)`, `create index ${3}_${1} on $1 $4`), + rmap(`(?i)alter table ([\S]+) add unique (index|key) ([\S]+) (.+)`, `create unique index ${3}_${1} on $1 $4`), + rmap(`(?i)([\S]+) enum[\s]*([(].*?[)])`, `$1 text check($1 in $2)`), + rmap(`(?i)([\s\S]+[/][*] sqlite3-skip [*][/][\s\S]+)`, ``), + rmap(`(?i)timestamp default current_timestamp`, `timestamp default ('')`), + rmap(`(?i)timestamp not null default current_timestamp`, `timestamp not null default ('')`), + + rmap(`(?i)add column (.*int) not null[\s]*$`, `add column $1 not null default 0`), + rmap(`(?i)add column (.* text) not null[\s]*$`, `add column $1 not null default ''`), + rmap(`(?i)add column (.* varchar.*) not null[\s]*$`, `add column $1 not null default ''`), +} + +var sqlite3InsertConversions = []regexpMap{ + rmap(`(?i)insert ignore ([\s\S]+) on duplicate key update [\s\S]+`, `insert or ignore $1`), + rmap(`(?i)insert ignore`, `insert or ignore`), + rmap(`(?i)now[(][)]`, `datetime('now')`), + rmap(`(?i)insert into ([\s\S]+) on duplicate key update [\s\S]+`, `replace into $1`), +} + +var sqlite3GeneralConversions = []regexpMap{ + rmap(`(?i)now[(][)][\s]*[-][\s]*interval [?] ([\w]+)`, `datetime('now', printf('-%d $1', ?))`), + rmap(`(?i)now[(][)][\s]*[+][\s]*interval [?] ([\w]+)`, `datetime('now', printf('+%d $1', ?))`), + rmap(`(?i)now[(][)][\s]*[-][\s]*interval ([0-9.]+) ([\w]+)`, `datetime('now', '-${1} $2')`), + rmap(`(?i)now[(][)][\s]*[+][\s]*interval ([0-9.]+) ([\w]+)`, `datetime('now', '+${1} $2')`), + + rmap(`(?i)[=<>\s]([\S]+[.][\S]+)[\s]*[-][\s]*interval [?] ([\w]+)`, ` datetime($1, printf('-%d $2', ?))`), + rmap(`(?i)[=<>\s]([\S]+[.][\S]+)[\s]*[+][\s]*interval [?] 
([\w]+)`, ` datetime($1, printf('+%d $2', ?))`), + + rmap(`(?i)unix_timestamp[(][)]`, `strftime('%s', 'now')`), + rmap(`(?i)unix_timestamp[(]([^)]+)[)]`, `strftime('%s', $1)`), + rmap(`(?i)now[(][)]`, `datetime('now')`), + rmap(`(?i)cast[(][\s]*([\S]+) as signed[\s]*[)]`, `cast($1 as integer)`), + + rmap(`(?i)\bconcat[(][\s]*([^,)]+)[\s]*,[\s]*([^,)]+)[\s]*[)]`, `($1 || $2)`), + rmap(`(?i)\bconcat[(][\s]*([^,)]+)[\s]*,[\s]*([^,)]+)[\s]*,[\s]*([^,)]+)[\s]*[)]`, `($1 || $2 || $3)`), + + rmap(`(?i) rlike `, ` like `), + + rmap(`(?i)create index([\s\S]+)[(][\s]*[0-9]+[\s]*[)]([\s\S]+)`, `create index ${1}${2}`), + rmap(`(?i)drop index ([\S]+) on ([\S]+)`, `drop index if exists $1`), +} + +var ( + sqlite3IdentifyCreateTableStatement = regexp.MustCompile(regexpSpaces(`(?i)^[\s]*create table`)) + sqlite3IdentifyCreateIndexStatement = regexp.MustCompile(regexpSpaces(`(?i)^[\s]*create( unique|) index`)) + sqlite3IdentifyDropIndexStatement = regexp.MustCompile(regexpSpaces(`(?i)^[\s]*drop index`)) + sqlite3IdentifyAlterTableStatement = regexp.MustCompile(regexpSpaces(`(?i)^[\s]*alter table`)) + sqlite3IdentifyInsertStatement = regexp.MustCompile(regexpSpaces(`(?i)^[\s]*(insert|replace)`)) +) + +func IsInsert(statement string) bool { + return sqlite3IdentifyInsertStatement.MatchString(statement) +} + +func IsCreateTable(statement string) bool { + return sqlite3IdentifyCreateTableStatement.MatchString(statement) +} + +func IsCreateIndex(statement string) bool { + return sqlite3IdentifyCreateIndexStatement.MatchString(statement) +} + +func IsDropIndex(statement string) bool { + return sqlite3IdentifyDropIndexStatement.MatchString(statement) +} + +func IsAlterTable(statement string) bool { + return sqlite3IdentifyAlterTableStatement.MatchString(statement) +} + +func ToSqlite3CreateTable(statement string) string { + return applyConversions(statement, sqlite3CreateTableConversions) +} + +func ToSqlite3Insert(statement string) string { + return applyConversions(statement, sqlite3InsertConversions) +} + +func ToSqlite3Dialect(statement string) (translated string) { + if IsCreateTable(statement) { + return ToSqlite3CreateTable(statement) + } + if IsAlterTable(statement) { + return ToSqlite3CreateTable(statement) + } + statement = applyConversions(statement, sqlite3GeneralConversions) + if IsInsert(statement) { + return ToSqlite3Insert(statement) + } + return statement +} diff --git a/go/vt/orchestrator/external/golib/sqlutils/sqlite_dialect_test.go b/go/vt/orchestrator/external/golib/sqlutils/sqlite_dialect_test.go new file mode 100644 index 0000000000..fa11fd5e2f --- /dev/null +++ b/go/vt/orchestrator/external/golib/sqlutils/sqlite_dialect_test.go @@ -0,0 +1,242 @@ +/* + Copyright 2017 GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package sqlutils + +import ( + "regexp" + "strings" + "testing" + + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +var spacesRegexp = regexp.MustCompile(`[\s]+`) + +func init() { +} + +func stripSpaces(statement string) string { + statement = strings.TrimSpace(statement) + statement = spacesRegexp.ReplaceAllString(statement, " ") + return statement +} + +func TestIsCreateTable(t *testing.T) { + test.S(t).ExpectTrue(IsCreateTable("create table t(id int)")) + test.S(t).ExpectTrue(IsCreateTable(" create table t(id int)")) + test.S(t).ExpectTrue(IsCreateTable("CREATE TABLE t(id int)")) + test.S(t).ExpectTrue(IsCreateTable(` + create table t(id int) + `)) + test.S(t).ExpectFalse(IsCreateTable("where create table t(id int)")) + test.S(t).ExpectFalse(IsCreateTable("insert")) +} + +func TestToSqlite3CreateTable(t *testing.T) { + { + statement := "create table t(id int)" + result := ToSqlite3CreateTable(statement) + test.S(t).ExpectEquals(result, statement) + } + { + statement := "create table t(id int, v varchar(123) CHARACTER SET ascii NOT NULL default '')" + result := ToSqlite3CreateTable(statement) + test.S(t).ExpectEquals(result, "create table t(id int, v varchar(123) NOT NULL default '')") + } + { + statement := "create table t(id int, v varchar ( 123 ) CHARACTER SET ascii NOT NULL default '')" + result := ToSqlite3CreateTable(statement) + test.S(t).ExpectEquals(result, "create table t(id int, v varchar ( 123 ) NOT NULL default '')") + } + { + statement := "create table t(i smallint unsigned)" + result := ToSqlite3CreateTable(statement) + test.S(t).ExpectEquals(result, "create table t(i smallint)") + } + { + statement := "create table t(i smallint(5) unsigned)" + result := ToSqlite3CreateTable(statement) + test.S(t).ExpectEquals(result, "create table t(i smallint)") + } + { + statement := "create table t(i smallint ( 5 ) unsigned)" + result := ToSqlite3CreateTable(statement) + test.S(t).ExpectEquals(result, "create table t(i smallint)") + } +} + +func TestToSqlite3AlterTable(t *testing.T) { + { + statement := ` + ALTER TABLE + database_instance + ADD COLUMN sql_delay INT UNSIGNED NOT NULL AFTER slave_lag_seconds + ` + result := stripSpaces(ToSqlite3Dialect(statement)) + test.S(t).ExpectEquals(result, stripSpaces(` + ALTER TABLE + database_instance + add column sql_delay int not null default 0 + `)) + } + { + statement := ` + ALTER TABLE + database_instance + ADD INDEX master_host_port_idx (master_host, master_port) + ` + result := stripSpaces(ToSqlite3Dialect(statement)) + test.S(t).ExpectEquals(result, stripSpaces(` + create index + master_host_port_idx_database_instance + on database_instance (master_host, master_port) + `)) + } + { + statement := ` + ALTER TABLE + topology_recovery + ADD KEY last_detection_idx (last_detection_id) + ` + result := stripSpaces(ToSqlite3Dialect(statement)) + test.S(t).ExpectEquals(result, stripSpaces(` + create index + last_detection_idx_topology_recovery + on topology_recovery (last_detection_id) + `)) + } + +} + +func TestCreateIndex(t *testing.T) { + { + statement := ` + create index + master_host_port_idx_database_instance + on database_instance (master_host(128), master_port) + ` + result := stripSpaces(ToSqlite3Dialect(statement)) + test.S(t).ExpectEquals(result, stripSpaces(` + create index + master_host_port_idx_database_instance + on database_instance (master_host, master_port) + `)) + } +} + +func TestIsInsert(t *testing.T) { + test.S(t).ExpectTrue(IsInsert("insert into t")) + test.S(t).ExpectTrue(IsInsert("insert ignore 
into t")) + test.S(t).ExpectTrue(IsInsert(` + insert ignore into t + `)) + test.S(t).ExpectFalse(IsInsert("where create table t(id int)")) + test.S(t).ExpectFalse(IsInsert("create table t(id int)")) + test.S(t).ExpectTrue(IsInsert(` + insert into + cluster_domain_name (cluster_name, domain_name, last_registered) + values + (?, ?, datetime('now')) + on duplicate key update + domain_name=values(domain_name), + last_registered=values(last_registered) + `)) +} + +func TestToSqlite3Insert(t *testing.T) { + { + statement := ` + insert into + cluster_domain_name (cluster_name, domain_name, last_registered) + values + (?, ?, datetime('now')) + on duplicate key update + domain_name=values(domain_name), + last_registered=values(last_registered) + ` + result := stripSpaces(ToSqlite3Dialect(statement)) + test.S(t).ExpectEquals(result, stripSpaces(` + replace into + cluster_domain_name (cluster_name, domain_name, last_registered) + values + (?, ?, datetime('now')) + `)) + } +} + +func TestToSqlite3GeneralConversions(t *testing.T) { + { + statement := "select now()" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select datetime('now')") + } + { + statement := "select now() - interval ? second" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select datetime('now', printf('-%d second', ?))") + } + { + statement := "select now() + interval ? minute" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select datetime('now', printf('+%d minute', ?))") + } + { + statement := "select now() + interval 5 minute" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select datetime('now', '+5 minute')") + } + { + statement := "select some_table.some_column + interval ? minute" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select datetime(some_table.some_column, printf('+%d minute', ?))") + } + { + statement := "AND master_instance.last_attempted_check <= master_instance.last_seen + interval ? 
minute" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "AND master_instance.last_attempted_check <= datetime(master_instance.last_seen, printf('+%d minute', ?))") + } + { + statement := "select concat(master_instance.port, '') as port" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select (master_instance.port || '') as port") + } + { + statement := "select concat( 'abc' , 'def') as s" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select ('abc' || 'def') as s") + } + { + statement := "select concat( 'abc' , 'def', last.col) as s" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select ('abc' || 'def' || last.col) as s") + } + { + statement := "select concat(myself.only) as s" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select concat(myself.only) as s") + } + { + statement := "select concat(1, '2', 3, '4') as s" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select concat(1, '2', 3, '4') as s") + } + { + statement := "select group_concat( 'abc' , 'def') as s" + result := ToSqlite3Dialect(statement) + test.S(t).ExpectEquals(result, "select group_concat( 'abc' , 'def') as s") + } +} diff --git a/go/vt/orchestrator/external/golib/sqlutils/sqlutils.go b/go/vt/orchestrator/external/golib/sqlutils/sqlutils.go new file mode 100644 index 0000000000..a61d0c06e3 --- /dev/null +++ b/go/vt/orchestrator/external/golib/sqlutils/sqlutils.go @@ -0,0 +1,429 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package sqlutils + +import ( + "database/sql" + "encoding/json" + "errors" + "fmt" + "strconv" + "strings" + "sync" + "time" + + _ "github.com/go-sql-driver/mysql" + _ "github.com/mattn/go-sqlite3" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +const DateTimeFormat = "2006-01-02 15:04:05.999999" + +// RowMap represents one row in a result set. Its objective is to allow +// for easy, typed getters by column name. 
+type RowMap map[string]CellData + +// Cell data is the result of a single (atomic) column in a single row +type CellData sql.NullString + +func (this *CellData) MarshalJSON() ([]byte, error) { + if this.Valid { + return json.Marshal(this.String) + } else { + return json.Marshal(nil) + } +} + +// UnmarshalJSON reds this object from JSON +func (this *CellData) UnmarshalJSON(b []byte) error { + var s string + if err := json.Unmarshal(b, &s); err != nil { + return err + } + (*this).String = s + (*this).Valid = true + + return nil +} + +func (this *CellData) NullString() *sql.NullString { + return (*sql.NullString)(this) +} + +// RowData is the result of a single row, in positioned array format +type RowData []CellData + +// MarshalJSON will marshal this map as JSON +func (this *RowData) MarshalJSON() ([]byte, error) { + cells := make([](*CellData), len(*this), len(*this)) + for i, val := range *this { + d := CellData(val) + cells[i] = &d + } + return json.Marshal(cells) +} + +func (this *RowData) Args() []interface{} { + result := make([]interface{}, len(*this)) + for i := range *this { + result[i] = (*(*this)[i].NullString()) + } + return result +} + +// ResultData is an ordered row set of RowData +type ResultData []RowData +type NamedResultData struct { + Columns []string + Data ResultData +} + +var EmptyResultData = ResultData{} + +func (this *RowMap) GetString(key string) string { + return (*this)[key].String +} + +// GetStringD returns a string from the map, or a default value if the key does not exist +func (this *RowMap) GetStringD(key string, def string) string { + if cell, ok := (*this)[key]; ok { + return cell.String + } + return def +} + +func (this *RowMap) GetInt64(key string) int64 { + res, _ := strconv.ParseInt(this.GetString(key), 10, 0) + return res +} + +func (this *RowMap) GetNullInt64(key string) sql.NullInt64 { + i, err := strconv.ParseInt(this.GetString(key), 10, 0) + if err == nil { + return sql.NullInt64{Int64: i, Valid: true} + } else { + return sql.NullInt64{Valid: false} + } +} + +func (this *RowMap) GetInt(key string) int { + res, _ := strconv.Atoi(this.GetString(key)) + return res +} + +func (this *RowMap) GetIntD(key string, def int) int { + res, err := strconv.Atoi(this.GetString(key)) + if err != nil { + return def + } + return res +} + +func (this *RowMap) GetUint(key string) uint { + res, _ := strconv.ParseUint(this.GetString(key), 10, 0) + return uint(res) +} + +func (this *RowMap) GetUintD(key string, def uint) uint { + res, err := strconv.Atoi(this.GetString(key)) + if err != nil { + return def + } + return uint(res) +} + +func (this *RowMap) GetUint64(key string) uint64 { + res, _ := strconv.ParseUint(this.GetString(key), 10, 0) + return res +} + +func (this *RowMap) GetUint64D(key string, def uint64) uint64 { + res, err := strconv.ParseUint(this.GetString(key), 10, 0) + if err != nil { + return def + } + return uint64(res) +} + +func (this *RowMap) GetBool(key string) bool { + return this.GetInt(key) != 0 +} + +func (this *RowMap) GetTime(key string) time.Time { + if t, err := time.Parse(DateTimeFormat, this.GetString(key)); err == nil { + return t + } + return time.Time{} +} + +// knownDBs is a DB cache by uri +var knownDBs map[string]*sql.DB = make(map[string]*sql.DB) +var knownDBsMutex = &sync.Mutex{} + +// GetDB returns a DB instance based on uri. 
+// bool result indicates whether the DB was returned from cache; err +func GetGenericDB(driverName, dataSourceName string) (*sql.DB, bool, error) { + knownDBsMutex.Lock() + defer func() { + knownDBsMutex.Unlock() + }() + + var exists bool + if _, exists = knownDBs[dataSourceName]; !exists { + if db, err := sql.Open(driverName, dataSourceName); err == nil { + knownDBs[dataSourceName] = db + } else { + return db, exists, err + } + } + return knownDBs[dataSourceName], exists, nil +} + +// GetDB returns a MySQL DB instance based on uri. +// bool result indicates whether the DB was returned from cache; err +func GetDB(mysql_uri string) (*sql.DB, bool, error) { + return GetGenericDB("mysql", mysql_uri) +} + +// GetDB returns a SQLite DB instance based on DB file name. +// bool result indicates whether the DB was returned from cache; err +func GetSQLiteDB(dbFile string) (*sql.DB, bool, error) { + return GetGenericDB("sqlite3", dbFile) +} + +// RowToArray is a convenience function, typically not called directly, which maps a +// single read database row into a NullString +func RowToArray(rows *sql.Rows, columns []string) []CellData { + buff := make([]interface{}, len(columns)) + data := make([]CellData, len(columns)) + for i := range buff { + buff[i] = data[i].NullString() + } + rows.Scan(buff...) + return data +} + +// ScanRowsToArrays is a convenience function, typically not called directly, which maps rows +// already read from the databse into arrays of NullString +func ScanRowsToArrays(rows *sql.Rows, on_row func([]CellData) error) error { + columns, _ := rows.Columns() + for rows.Next() { + arr := RowToArray(rows, columns) + err := on_row(arr) + if err != nil { + return err + } + } + return nil +} + +func rowToMap(row []CellData, columns []string) map[string]CellData { + m := make(map[string]CellData) + for k, data_col := range row { + m[columns[k]] = data_col + } + return m +} + +// ScanRowsToMaps is a convenience function, typically not called directly, which maps rows +// already read from the databse into RowMap entries. +func ScanRowsToMaps(rows *sql.Rows, on_row func(RowMap) error) error { + columns, _ := rows.Columns() + err := ScanRowsToArrays(rows, func(arr []CellData) error { + m := rowToMap(arr, columns) + err := on_row(m) + if err != nil { + return err + } + return nil + }) + return err +} + +// QueryRowsMap is a convenience function allowing querying a result set while poviding a callback +// function activated per read row. +func QueryRowsMap(db *sql.DB, query string, on_row func(RowMap) error, args ...interface{}) (err error) { + defer func() { + if derr := recover(); derr != nil { + err = fmt.Errorf("QueryRowsMap unexpected error: %+v", derr) + } + }() + + var rows *sql.Rows + rows, err = db.Query(query, args...) + if rows != nil { + defer rows.Close() + } + if err != nil && err != sql.ErrNoRows { + return log.Errore(err) + } + err = ScanRowsToMaps(rows, on_row) + return +} + +// queryResultData returns a raw array of rows for a given query, optionally reading and returning column names +func queryResultData(db *sql.DB, query string, retrieveColumns bool, args ...interface{}) (resultData ResultData, columns []string, err error) { + defer func() { + if derr := recover(); derr != nil { + err = errors.New(fmt.Sprintf("QueryRowsMap unexpected error: %+v", derr)) + } + }() + + var rows *sql.Rows + rows, err = db.Query(query, args...) 
+ defer rows.Close() + if err != nil && err != sql.ErrNoRows { + return EmptyResultData, columns, log.Errore(err) + } + if retrieveColumns { + // Don't pay if you don't want to + columns, _ = rows.Columns() + } + resultData = ResultData{} + err = ScanRowsToArrays(rows, func(rowData []CellData) error { + resultData = append(resultData, rowData) + return nil + }) + return resultData, columns, err +} + +// QueryResultData returns a raw array of rows +func QueryResultData(db *sql.DB, query string, args ...interface{}) (ResultData, error) { + resultData, _, err := queryResultData(db, query, false, args...) + return resultData, err +} + +// QueryResultDataNamed returns a raw array of rows, with column names +func QueryNamedResultData(db *sql.DB, query string, args ...interface{}) (NamedResultData, error) { + resultData, columns, err := queryResultData(db, query, true, args...) + return NamedResultData{Columns: columns, Data: resultData}, err +} + +// QueryRowsMapBuffered reads data from the database into a buffer, and only then applies the given function per row. +// This allows the application to take its time with processing the data, albeit consuming as much memory as required by +// the result set. +func QueryRowsMapBuffered(db *sql.DB, query string, on_row func(RowMap) error, args ...interface{}) error { + resultData, columns, err := queryResultData(db, query, true, args...) + if err != nil { + // Already logged + return err + } + for _, row := range resultData { + err = on_row(rowToMap(row, columns)) + if err != nil { + return err + } + } + return nil +} + +// ExecNoPrepare executes given query using given args on given DB, without using prepared statements. +func ExecNoPrepare(db *sql.DB, query string, args ...interface{}) (res sql.Result, err error) { + defer func() { + if derr := recover(); derr != nil { + err = errors.New(fmt.Sprintf("ExecNoPrepare unexpected error: %+v", derr)) + } + }() + + res, err = db.Exec(query, args...) + if err != nil { + log.Errore(err) + } + return res, err +} + +// ExecQuery executes given query using given args on given DB. It will safele prepare, execute and close +// the statement. +func execInternal(silent bool, db *sql.DB, query string, args ...interface{}) (res sql.Result, err error) { + defer func() { + if derr := recover(); derr != nil { + err = errors.New(fmt.Sprintf("execInternal unexpected error: %+v", derr)) + } + }() + var stmt *sql.Stmt + stmt, err = db.Prepare(query) + if err != nil { + return nil, err + } + defer stmt.Close() + res, err = stmt.Exec(args...) + if err != nil && !silent { + log.Errore(err) + } + return res, err +} + +// Exec executes given query using given args on given DB. It will safele prepare, execute and close +// the statement. +func Exec(db *sql.DB, query string, args ...interface{}) (sql.Result, error) { + return execInternal(false, db, query, args...) +} + +// ExecSilently acts like Exec but does not report any error +func ExecSilently(db *sql.DB, query string, args ...interface{}) (sql.Result, error) { + return execInternal(true, db, query, args...) 
+} + +func InClauseStringValues(terms []string) string { + quoted := []string{} + for _, s := range terms { + quoted = append(quoted, fmt.Sprintf("'%s'", strings.Replace(s, ",", "''", -1))) + } + return strings.Join(quoted, ", ") +} + +// Convert variable length arguments into arguments array +func Args(args ...interface{}) []interface{} { + return args +} + +func NilIfZero(i int64) interface{} { + if i == 0 { + return nil + } + return i +} + +func ScanTable(db *sql.DB, tableName string) (NamedResultData, error) { + query := fmt.Sprintf("select * from %s", tableName) + return QueryNamedResultData(db, query) +} + +func WriteTable(db *sql.DB, tableName string, data NamedResultData) (err error) { + if len(data.Data) == 0 { + return nil + } + if len(data.Columns) == 0 { + return nil + } + placeholders := make([]string, len(data.Columns)) + for i := range placeholders { + placeholders[i] = "?" + } + query := fmt.Sprintf( + `replace into %s (%s) values (%s)`, + tableName, + strings.Join(data.Columns, ","), + strings.Join(placeholders, ","), + ) + for _, rowData := range data.Data { + if _, execErr := db.Exec(query, rowData.Args()...); execErr != nil { + err = execErr + } + } + return err +} diff --git a/go/vt/orchestrator/external/golib/tests/spec.go b/go/vt/orchestrator/external/golib/tests/spec.go new file mode 100644 index 0000000000..a52c7291a9 --- /dev/null +++ b/go/vt/orchestrator/external/golib/tests/spec.go @@ -0,0 +1,76 @@ +package tests + +import ( + "testing" +) + +// Spec is an access point to test Expections +type Spec struct { + t *testing.T +} + +// S generates a spec. You will want to use it once in a test file, once in a test or once per each check +func S(t *testing.T) *Spec { + return &Spec{t: t} +} + +// ExpectNil expects given value to be nil, or errors +func (spec *Spec) ExpectNil(actual interface{}) { + if actual == nil { + return + } + spec.t.Errorf("Expected %+v to be nil", actual) +} + +// ExpectNotNil expects given value to be not nil, or errors +func (spec *Spec) ExpectNotNil(actual interface{}) { + if actual != nil { + return + } + spec.t.Errorf("Expected %+v to be not nil", actual) +} + +// ExpectEquals expects given values to be equal (comparison via `==`), or errors +func (spec *Spec) ExpectEquals(actual, value interface{}) { + if actual == value { + return + } + spec.t.Errorf("Expected:\n[[[%+v]]]\n- got:\n[[[%+v]]]", value, actual) +} + +// ExpectNotEquals expects given values to be nonequal (comparison via `==`), or errors +func (spec *Spec) ExpectNotEquals(actual, value interface{}) { + if !(actual == value) { + return + } + spec.t.Errorf("Expected not %+v", value) +} + +// ExpectEqualsAny expects given actual to equal (comparison via `==`) at least one of given values, or errors +func (spec *Spec) ExpectEqualsAny(actual interface{}, values ...interface{}) { + for _, value := range values { + if actual == value { + return + } + } + spec.t.Errorf("Expected %+v to equal any of given values", actual) +} + +// ExpectNotEqualsAny expects given actual to be nonequal (comparison via `==`)tp any of given values, or errors +func (spec *Spec) ExpectNotEqualsAny(actual interface{}, values ...interface{}) { + for _, value := range values { + if actual == value { + spec.t.Errorf("Expected not %+v", value) + } + } +} + +// ExpectFalse expects given values to be false, or errors +func (spec *Spec) ExpectFalse(actual interface{}) { + spec.ExpectEquals(actual, false) +} + +// ExpectTrue expects given values to be true, or errors +func (spec *Spec) ExpectTrue(actual 
interface{}) { + spec.ExpectEquals(actual, true) +} diff --git a/go/vt/orchestrator/external/golib/util/text.go b/go/vt/orchestrator/external/golib/util/text.go new file mode 100644 index 0000000000..2b0ae9a7e8 --- /dev/null +++ b/go/vt/orchestrator/external/golib/util/text.go @@ -0,0 +1,103 @@ +/* + Copyright 2015 Shlomi Noach. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package util + +import ( + "errors" + "fmt" + "regexp" + "strconv" + "strings" +) + +const ( + TabulateLeft = 0 + TabulateRight = 1 +) + +// ParseSimpleTime parses input in the format 7s, 55m, 3h, 31d, 4w (second, minute, hour, day, week) +// The time.ParseDuration() function should have done this, but it does not support "d" and "w" extensions. +func SimpleTimeToSeconds(simpleTime string) (int, error) { + if matched, _ := regexp.MatchString("^[0-9]+s$", simpleTime); matched { + i, _ := strconv.Atoi(simpleTime[0 : len(simpleTime)-1]) + return i, nil + } + if matched, _ := regexp.MatchString("^[0-9]+m$", simpleTime); matched { + i, _ := strconv.Atoi(simpleTime[0 : len(simpleTime)-1]) + return i * 60, nil + } + if matched, _ := regexp.MatchString("^[0-9]+h$", simpleTime); matched { + i, _ := strconv.Atoi(simpleTime[0 : len(simpleTime)-1]) + return i * 60 * 60, nil + } + if matched, _ := regexp.MatchString("^[0-9]+d$", simpleTime); matched { + i, _ := strconv.Atoi(simpleTime[0 : len(simpleTime)-1]) + return i * 60 * 60 * 24, nil + } + if matched, _ := regexp.MatchString("^[0-9]+w$", simpleTime); matched { + i, _ := strconv.Atoi(simpleTime[0 : len(simpleTime)-1]) + return i * 60 * 60 * 24 * 7, nil + } + return 0, errors.New(fmt.Sprintf("Cannot parse simple time: %s", simpleTime)) +} + +func Tabulate(lines []string, separator string, outputSeparator string, directionFlags ...int) (result []string) { + tokens := make([][]string, 0) + widths := make([][]int, 0) + countColumns := 0 + for _, line := range lines { + lineTokens := strings.Split(line, separator) + lineWidths := make([]int, len(lineTokens)) + for i := range lineTokens { + lineWidths[i] = len(lineTokens[i]) + } + tokens = append(tokens, lineTokens) + widths = append(widths, lineWidths) + if len(lineTokens) > countColumns { + countColumns = len(lineTokens) + } + } + columnWidths := make([]int, countColumns) + for _, lineTokens := range tokens { + for col, token := range lineTokens { + if len(token) > columnWidths[col] { + columnWidths[col] = len(token) + } + } + } + for _, lineTokens := range tokens { + resultRow := "" + for col := 0; col < countColumns; col++ { + token := "" + if col < len(lineTokens) { + token = lineTokens[col] + } + format := fmt.Sprintf("%%-%ds", columnWidths[col]) // format left + if col < len(directionFlags) && directionFlags[col] == TabulateRight { + format = fmt.Sprintf("%%%ds", columnWidths[col]) + } + formattedToken := fmt.Sprintf(format, token) + if col == 0 { + resultRow = formattedToken + } else { + resultRow = fmt.Sprintf("%s%s%s", resultRow, outputSeparator, formattedToken) + } + } + result = append(result, resultRow) + } + return result +} 
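The util package vendored above drives orchestrator's plain-text output. A minimal usage sketch of `SimpleTimeToSeconds` and `Tabulate`, relying only on the signatures and flags shown in the diff; the host names and numbers are invented for illustration:

```go
package main

import (
	"fmt"

	"vitess.io/vitess/go/vt/orchestrator/external/golib/util"
)

func main() {
	// "90m" is 90 minutes -> 5400 seconds; unsupported suffixes return an error.
	seconds, err := util.SimpleTimeToSeconds("90m")
	if err != nil {
		panic(err)
	}
	fmt.Println(seconds) // 5400

	// Tabulate pads each column to the width of its widest cell.
	// Input columns are split on ",", output columns are joined with " | ",
	// and the last column is right-aligned via TabulateRight.
	rows := []string{
		"host,port,lag",
		"db-1,3306,0",
		"db-replica-2,3306,12",
	}
	for _, line := range util.Tabulate(rows, ",", " | ", util.TabulateLeft, util.TabulateLeft, util.TabulateRight) {
		fmt.Println(line)
	}
}
```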
diff --git a/go/vt/orchestrator/external/golib/util/text_test.go b/go/vt/orchestrator/external/golib/util/text_test.go new file mode 100644 index 0000000000..26068fec70 --- /dev/null +++ b/go/vt/orchestrator/external/golib/util/text_test.go @@ -0,0 +1,88 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package util + +import ( + "reflect" + "strings" + "testing" + + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func init() { +} + +func TestTabulate(t *testing.T) { + { + text := strings.TrimSpace(` +a,b,c +d,e,f +g,h,i + `) + + tabulated := Tabulate(strings.Split(text, "\n"), ",", ",") + expected := strings.Split(text, "\n") + test.S(t).ExpectTrue(reflect.DeepEqual(tabulated, expected)) + } + { + text := strings.TrimSpace(` +a,b,c +d,e,f +g,h,i + `) + + tabulated := Tabulate(strings.Split(text, "\n"), ",", "|") + expected := []string{ + "a|b|c", + "d|e|f", + "g|h|i", + } + test.S(t).ExpectTrue(reflect.DeepEqual(tabulated, expected)) + } + { + text := strings.TrimSpace(` +a,20,c +d,e,100 +0000,h,i + `) + + tabulated := Tabulate(strings.Split(text, "\n"), ",", "|") + expected := []string{ + "a |20|c ", + "d |e |100", + "0000|h |i ", + } + test.S(t).ExpectTrue(reflect.DeepEqual(tabulated, expected)) + } + { + text := strings.TrimSpace(` +a,20,c +d,1,100 +0000,3,i + `) + + tabulated := Tabulate(strings.Split(text, "\n"), ",", "|", TabulateLeft, TabulateRight, TabulateRight) + expected := []string{ + "a |20| c", + "d | 1|100", + "0000| 3| i", + } + + test.S(t).ExpectTrue(reflect.DeepEqual(tabulated, expected)) + } +} diff --git a/go/vt/orchestrator/external/raft/.gitignore b/go/vt/orchestrator/external/raft/.gitignore new file mode 100644 index 0000000000..836562412f --- /dev/null +++ b/go/vt/orchestrator/external/raft/.gitignore @@ -0,0 +1,23 @@ +# Compiled Object files, Static and Dynamic libs (Shared Objects) +*.o +*.a +*.so + +# Folders +_obj +_test + +# Architecture specific extensions/prefixes +*.[568vq] +[568vq].out + +*.cgo1.go +*.cgo2.c +_cgo_defun.c +_cgo_gotypes.go +_cgo_export.* + +_testmain.go + +*.exe +*.test diff --git a/go/vt/orchestrator/external/raft/.travis.yml b/go/vt/orchestrator/external/raft/.travis.yml new file mode 100644 index 0000000000..94eb8668b6 --- /dev/null +++ b/go/vt/orchestrator/external/raft/.travis.yml @@ -0,0 +1,16 @@ +language: go + +go: + - 1.4 + - 1.5 + - 1.6 + - tip + +install: make deps +script: + - make integ + +notifications: + flowdock: + secure: fZrcf9rlh2IrQrlch1sHkn3YI7SKvjGnAl/zyV5D6NROe1Bbr6d3QRMuCXWWdhJHzjKmXk5rIzbqJhUc0PNF7YjxGNKSzqWMQ56KcvN1k8DzlqxpqkcA3Jbs6fXCWo2fssRtZ7hj/wOP1f5n6cc7kzHDt9dgaYJ6nO2fqNPJiTc= + diff --git a/go/vt/orchestrator/external/raft/LICENSE b/go/vt/orchestrator/external/raft/LICENSE new file mode 100644 index 0000000000..c33dcc7c92 --- /dev/null +++ b/go/vt/orchestrator/external/raft/LICENSE @@ -0,0 +1,354 @@ +Mozilla Public License, version 2.0 + +1. Definitions + +1.1. 
“Contributor” + + means each individual or legal entity that creates, contributes to the + creation of, or owns Covered Software. + +1.2. “Contributor Version” + + means the combination of the Contributions of others (if any) used by a + Contributor and that particular Contributor’s Contribution. + +1.3. “Contribution” + + means Covered Software of a particular Contributor. + +1.4. “Covered Software” + + means Source Code Form to which the initial Contributor has attached the + notice in Exhibit A, the Executable Form of such Source Code Form, and + Modifications of such Source Code Form, in each case including portions + thereof. + +1.5. “Incompatible With Secondary Licenses” + means + + a. that the initial Contributor has attached the notice described in + Exhibit B to the Covered Software; or + + b. that the Covered Software was made available under the terms of version + 1.1 or earlier of the License, but not also under the terms of a + Secondary License. + +1.6. “Executable Form” + + means any form of the work other than Source Code Form. + +1.7. “Larger Work” + + means a work that combines Covered Software with other material, in a separate + file or files, that is not Covered Software. + +1.8. “License” + + means this document. + +1.9. “Licensable” + + means having the right to grant, to the maximum extent possible, whether at the + time of the initial grant or subsequently, any and all of the rights conveyed by + this License. + +1.10. “Modifications” + + means any of the following: + + a. any file in Source Code Form that results from an addition to, deletion + from, or modification of the contents of Covered Software; or + + b. any new file in Source Code Form that contains any Covered Software. + +1.11. “Patent Claims” of a Contributor + + means any patent claim(s), including without limitation, method, process, + and apparatus claims, in any patent Licensable by such Contributor that + would be infringed, but for the grant of the License, by the making, + using, selling, offering for sale, having made, import, or transfer of + either its Contributions or its Contributor Version. + +1.12. “Secondary License” + + means either the GNU General Public License, Version 2.0, the GNU Lesser + General Public License, Version 2.1, the GNU Affero General Public + License, Version 3.0, or any later versions of those licenses. + +1.13. “Source Code Form” + + means the form of the work preferred for making modifications. + +1.14. “You” (or “Your”) + + means an individual or a legal entity exercising rights under this + License. For legal entities, “You” includes any entity that controls, is + controlled by, or is under common control with You. For purposes of this + definition, “control” means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by contract or + otherwise, or (b) ownership of more than fifty percent (50%) of the + outstanding shares or beneficial ownership of such entity. + + +2. License Grants and Conditions + +2.1. Grants + + Each Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + a. under intellectual property rights (other than patent or trademark) + Licensable by such Contributor to use, reproduce, make available, + modify, display, perform, distribute, and otherwise exploit its + Contributions, either on an unmodified basis, with Modifications, or as + part of a Larger Work; and + + b. 
under Patent Claims of such Contributor to make, use, sell, offer for + sale, have made, import, and otherwise transfer either its Contributions + or its Contributor Version. + +2.2. Effective Date + + The licenses granted in Section 2.1 with respect to any Contribution become + effective for each Contribution on the date the Contributor first distributes + such Contribution. + +2.3. Limitations on Grant Scope + + The licenses granted in this Section 2 are the only rights granted under this + License. No additional rights or licenses will be implied from the distribution + or licensing of Covered Software under this License. Notwithstanding Section + 2.1(b) above, no patent license is granted by a Contributor: + + a. for any code that a Contributor has removed from Covered Software; or + + b. for infringements caused by: (i) Your and any other third party’s + modifications of Covered Software, or (ii) the combination of its + Contributions with other software (except as part of its Contributor + Version); or + + c. under Patent Claims infringed by Covered Software in the absence of its + Contributions. + + This License does not grant any rights in the trademarks, service marks, or + logos of any Contributor (except as may be necessary to comply with the + notice requirements in Section 3.4). + +2.4. Subsequent Licenses + + No Contributor makes additional grants as a result of Your choice to + distribute the Covered Software under a subsequent version of this License + (see Section 10.2) or under the terms of a Secondary License (if permitted + under the terms of Section 3.3). + +2.5. Representation + + Each Contributor represents that the Contributor believes its Contributions + are its original creation(s) or it has sufficient rights to grant the + rights to its Contributions conveyed by this License. + +2.6. Fair Use + + This License is not intended to limit any rights You have under applicable + copyright doctrines of fair use, fair dealing, or other equivalents. + +2.7. Conditions + + Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted in + Section 2.1. + + +3. Responsibilities + +3.1. Distribution of Source Form + + All distribution of Covered Software in Source Code Form, including any + Modifications that You create or to which You contribute, must be under the + terms of this License. You must inform recipients that the Source Code Form + of the Covered Software is governed by the terms of this License, and how + they can obtain a copy of this License. You may not attempt to alter or + restrict the recipients’ rights in the Source Code Form. + +3.2. Distribution of Executable Form + + If You distribute Covered Software in Executable Form then: + + a. such Covered Software must also be made available in Source Code Form, + as described in Section 3.1, and You must inform recipients of the + Executable Form how they can obtain a copy of such Source Code Form by + reasonable means in a timely manner, at a charge no more than the cost + of distribution to the recipient; and + + b. You may distribute such Executable Form under the terms of this License, + or sublicense it under different terms, provided that the license for + the Executable Form does not attempt to limit or alter the recipients’ + rights in the Source Code Form under this License. + +3.3. Distribution of a Larger Work + + You may create and distribute a Larger Work under terms of Your choice, + provided that You also comply with the requirements of this License for the + Covered Software. 
If the Larger Work is a combination of Covered Software + with a work governed by one or more Secondary Licenses, and the Covered + Software is not Incompatible With Secondary Licenses, this License permits + You to additionally distribute such Covered Software under the terms of + such Secondary License(s), so that the recipient of the Larger Work may, at + their option, further distribute the Covered Software under the terms of + either this License or such Secondary License(s). + +3.4. Notices + + You may not remove or alter the substance of any license notices (including + copyright notices, patent notices, disclaimers of warranty, or limitations + of liability) contained within the Source Code Form of the Covered + Software, except that You may alter any license notices to the extent + required to remedy known factual inaccuracies. + +3.5. Application of Additional Terms + + You may choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of Covered + Software. However, You may do so only on Your own behalf, and not on behalf + of any Contributor. You must make it absolutely clear that any such + warranty, support, indemnity, or liability obligation is offered by You + alone, and You hereby agree to indemnify every Contributor for any + liability incurred by such Contributor as a result of warranty, support, + indemnity or liability terms You offer. You may include additional + disclaimers of warranty and limitations of liability specific to any + jurisdiction. + +4. Inability to Comply Due to Statute or Regulation + + If it is impossible for You to comply with any of the terms of this License + with respect to some or all of the Covered Software due to statute, judicial + order, or regulation then You must: (a) comply with the terms of this License + to the maximum extent possible; and (b) describe the limitations and the code + they affect. Such description must be placed in a text file included with all + distributions of the Covered Software under this License. Except to the + extent prohibited by statute or regulation, such description must be + sufficiently detailed for a recipient of ordinary skill to be able to + understand it. + +5. Termination + +5.1. The rights granted under this License will terminate automatically if You + fail to comply with any of its terms. However, if You become compliant, + then the rights granted under this License from a particular Contributor + are reinstated (a) provisionally, unless and until such Contributor + explicitly and finally terminates Your grants, and (b) on an ongoing basis, + if such Contributor fails to notify You of the non-compliance by some + reasonable means prior to 60 days after You have come back into compliance. + Moreover, Your grants from a particular Contributor are reinstated on an + ongoing basis if such Contributor notifies You of the non-compliance by + some reasonable means, this is the first time You have received notice of + non-compliance with this License from such Contributor, and You become + compliant prior to 30 days after Your receipt of the notice. + +5.2. If You initiate litigation against any entity by asserting a patent + infringement claim (excluding declaratory judgment actions, counter-claims, + and cross-claims) alleging that a Contributor Version directly or + indirectly infringes any patent, then the rights granted to You by any and + all Contributors for the Covered Software under Section 2.1 of this License + shall terminate. 
+ +5.3. In the event of termination under Sections 5.1 or 5.2 above, all end user + license agreements (excluding distributors and resellers) which have been + validly granted by You or Your distributors under this License prior to + termination shall survive termination. + +6. Disclaimer of Warranty + + Covered Software is provided under this License on an “as is” basis, without + warranty of any kind, either expressed, implied, or statutory, including, + without limitation, warranties that the Covered Software is free of defects, + merchantable, fit for a particular purpose or non-infringing. The entire + risk as to the quality and performance of the Covered Software is with You. + Should any Covered Software prove defective in any respect, You (not any + Contributor) assume the cost of any necessary servicing, repair, or + correction. This disclaimer of warranty constitutes an essential part of this + License. No use of any Covered Software is authorized under this License + except under this disclaimer. + +7. Limitation of Liability + + Under no circumstances and under no legal theory, whether tort (including + negligence), contract, or otherwise, shall any Contributor, or anyone who + distributes Covered Software as permitted above, be liable to You for any + direct, indirect, special, incidental, or consequential damages of any + character including, without limitation, damages for lost profits, loss of + goodwill, work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses, even if such party shall have been + informed of the possibility of such damages. This limitation of liability + shall not apply to liability for death or personal injury resulting from such + party’s negligence to the extent applicable law prohibits such limitation. + Some jurisdictions do not allow the exclusion or limitation of incidental or + consequential damages, so this exclusion and limitation may not apply to You. + +8. Litigation + + Any litigation relating to this License may be brought only in the courts of + a jurisdiction where the defendant maintains its principal place of business + and such litigation shall be governed by laws of that jurisdiction, without + reference to its conflict-of-law provisions. Nothing in this Section shall + prevent a party’s ability to bring cross-claims or counter-claims. + +9. Miscellaneous + + This License represents the complete agreement concerning the subject matter + hereof. If any provision of this License is held to be unenforceable, such + provision shall be reformed only to the extent necessary to make it + enforceable. Any law or regulation which provides that the language of a + contract shall be construed against the drafter shall not be used to construe + this License against a Contributor. + + +10. Versions of the License + +10.1. New Versions + + Mozilla Foundation is the license steward. Except as provided in Section + 10.3, no one other than the license steward has the right to modify or + publish new versions of this License. Each version will be given a + distinguishing version number. + +10.2. Effect of New Versions + + You may distribute the Covered Software under the terms of the version of + the License under which You originally received the Covered Software, or + under the terms of any subsequent version published by the license + steward. + +10.3. 
Modified Versions + + If you create software not governed by this License, and you want to + create a new license for such software, you may create and use a modified + version of this License if you rename the license and remove any + references to the name of the license steward (except to note that such + modified license differs from this License). + +10.4. Distributing Source Code Form that is Incompatible With Secondary Licenses + If You choose to distribute Source Code Form that is Incompatible With + Secondary Licenses under the terms of this version of the License, the + notice described in Exhibit B of this License must be attached. + +Exhibit A - Source Code Form License Notice + + This Source Code Form is subject to the + terms of the Mozilla Public License, v. + 2.0. If a copy of the MPL was not + distributed with this file, You can + obtain one at + http://mozilla.org/MPL/2.0/. + +If it is not possible or desirable to put the notice in a particular file, then +You may include the notice in a location (such as a LICENSE file in a relevant +directory) where a recipient would be likely to look for such a notice. + +You may add additional accurate notices of copyright ownership. + +Exhibit B - “Incompatible With Secondary Licenses” Notice + + This Source Code Form is “Incompatible + With Secondary Licenses”, as defined by + the Mozilla Public License, v. 2.0. + diff --git a/go/vt/orchestrator/external/raft/Makefile b/go/vt/orchestrator/external/raft/Makefile new file mode 100644 index 0000000000..556aa2e20f --- /dev/null +++ b/go/vt/orchestrator/external/raft/Makefile @@ -0,0 +1,17 @@ +DEPS = $(go list -f '{{range .TestImports}}{{.}} {{end}}' ./...) + +test: + go test -timeout=30s ./... + +integ: test + INTEG_TESTS=yes go test -timeout=3s -run=Integ ./... + +deps: + go get -d -v ./... + echo $(DEPS) | xargs -n1 go get -d + +cov: + INTEG_TESTS=yes gocov test github.com/hashicorp/raft | gocov-html > /tmp/coverage.html + open /tmp/coverage.html + +.PHONY: test cov integ deps diff --git a/go/vt/orchestrator/external/raft/README.md b/go/vt/orchestrator/external/raft/README.md new file mode 100644 index 0000000000..760a45a30f --- /dev/null +++ b/go/vt/orchestrator/external/raft/README.md @@ -0,0 +1,89 @@ +raft [![Build Status](https://travis-ci.org/hashicorp/raft.png)](https://travis-ci.org/hashicorp/raft) +==== + +raft is a [Go](http://www.golang.org) library that manages a replicated +log and can be used with an FSM to manage replicated state machines. It +is a library for providing [consensus](http://en.wikipedia.org/wiki/Consensus_(computer_science)). + +The use cases for such a library are far-reaching as replicated state +machines are a key component of many distributed systems. They enable +building Consistent, Partition Tolerant (CP) systems, with limited +fault tolerance as well. + +## Building + +If you wish to build raft you'll need Go version 1.2+ installed. + +Please check your installation with: + +``` +go version +``` + +## Documentation + +For complete documentation, see the associated [Godoc](http://godoc.org/github.com/hashicorp/raft). + +To prevent complications with cgo, the primary backend `MDBStore` is in a separate repository, +called [raft-mdb](http://github.com/hashicorp/raft-mdb). That is the recommended implementation +for the `LogStore` and `StableStore`. + +A pure Go backend using [BoltDB](https://github.com/boltdb/bolt) is also available called +[raft-boltdb](https://github.com/hashicorp/raft-boltdb). It can also be used as a `LogStore` +and `StableStore`. 
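Before the protocol walkthrough below, a minimal sketch of the `LogStore`/`StableStore` contract from a caller's perspective. It assumes the vendored copy also carries upstream hashicorp/raft's in-memory store (`NewInmemStore`), which upstream ships for testing; durable deployments would plug in raft-boltdb or raft-mdb as noted above:

```go
package main

import (
	"fmt"

	"vitess.io/vitess/go/vt/orchestrator/external/raft"
)

func main() {
	// NewInmemStore satisfies both LogStore and StableStore (assumed present in this vendoring).
	store := raft.NewInmemStore()

	// LogStore: persist and fetch replicated log entries by index.
	if err := store.StoreLog(&raft.Log{Index: 1, Data: []byte("payload")}); err != nil {
		panic(err)
	}
	var entry raft.Log
	if err := store.GetLog(1, &entry); err != nil {
		panic(err)
	}
	fmt.Printf("index=%d data=%s\n", entry.Index, entry.Data)

	// StableStore: persist small key/value state such as the current term.
	if err := store.SetUint64([]byte("CurrentTerm"), 3); err != nil {
		panic(err)
	}
	term, err := store.GetUint64([]byte("CurrentTerm"))
	if err != nil {
		panic(err)
	}
	fmt.Println(term) // 3
}
```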
+ +## Protocol + +raft is based on ["Raft: In Search of an Understandable Consensus Algorithm"](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf) + +A high level overview of the Raft protocol is described below, but for details please read the full +[Raft paper](https://ramcloud.stanford.edu/wiki/download/attachments/11370504/raft.pdf) +followed by the raft source. Any questions about the raft protocol should be sent to the +[raft-dev mailing list](https://groups.google.com/forum/#!forum/raft-dev). + +### Protocol Description + +Raft nodes are always in one of three states: follower, candidate or leader. All +nodes initially start out as a follower. In this state, nodes can accept log entries +from a leader and cast votes. If no entries are received for some time, nodes +self-promote to the candidate state. In the candidate state nodes request votes from +their peers. If a candidate receives a quorum of votes, then it is promoted to a leader. +The leader must accept new log entries and replicate to all the other followers. +In addition, if stale reads are not acceptable, all queries must also be performed on +the leader. + +Once a cluster has a leader, it is able to accept new log entries. A client can +request that a leader append a new log entry, which is an opaque binary blob to +Raft. The leader then writes the entry to durable storage and attempts to replicate +to a quorum of followers. Once the log entry is considered *committed*, it can be +*applied* to a finite state machine. The finite state machine is application specific, +and is implemented using an interface. + +An obvious question relates to the unbounded nature of a replicated log. Raft provides +a mechanism by which the current state is snapshotted, and the log is compacted. Because +of the FSM abstraction, restoring the state of the FSM must result in the same state +as a replay of old logs. This allows Raft to capture the FSM state at a point in time, +and then remove all the logs that were used to reach that state. This is performed automatically +without user intervention, and prevents unbounded disk usage as well as minimizing +time spent replaying logs. + +Lastly, there is the issue of updating the peer set when new servers are joining +or existing servers are leaving. As long as a quorum of nodes is available, this +is not an issue as Raft provides mechanisms to dynamically update the peer set. +If a quorum of nodes is unavailable, then this becomes a very challenging issue. +For example, suppose there are only 2 peers, A and B. The quorum size is also +2, meaning both nodes must agree to commit a log entry. If either A or B fails, +it is now impossible to reach quorum. This means the cluster is unable to add, +or remove a node, or commit any additional log entries. This results in *unavailability*. +At this point, manual intervention would be required to remove either A or B, +and to restart the remaining node in bootstrap mode. + +A Raft cluster of 3 nodes can tolerate a single node failure, while a cluster +of 5 can tolerate 2 node failures. The recommended configuration is to either +run 3 or 5 raft servers. This maximizes availability without +greatly sacrificing performance. + +In terms of performance, Raft is comparable to Paxos. Assuming stable leadership, +committing a log entry requires a single round trip to half of the cluster. +Thus performance is bound by disk I/O and network latency. 
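The fault-tolerance figures quoted above follow directly from majority quorums; a tiny standalone sketch (not part of the vendored library) that reproduces them:

```go
package main

import "fmt"

// quorum is the number of votes a Raft cluster of n servers needs to commit an entry;
// tolerated is how many servers may fail while a quorum remains reachable.
func quorum(n int) int    { return n/2 + 1 }
func tolerated(n int) int { return n - quorum(n) }

func main() {
	for _, n := range []int{1, 2, 3, 5, 7} {
		fmt.Printf("servers=%d quorum=%d tolerated failures=%d\n", n, quorum(n), tolerated(n))
	}
	// servers=3 -> quorum=2, 1 tolerated failure
	// servers=5 -> quorum=3, 2 tolerated failures
}
```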
+ diff --git a/go/vt/orchestrator/external/raft/bench/bench.go b/go/vt/orchestrator/external/raft/bench/bench.go new file mode 100644 index 0000000000..8228c0e1b3 --- /dev/null +++ b/go/vt/orchestrator/external/raft/bench/bench.go @@ -0,0 +1,172 @@ +package raftbench + +// raftbench provides common benchmarking functions which can be used by +// anything which implements the raft.LogStore and raft.StableStore interfaces. +// All functions accept these interfaces and perform benchmarking. This +// makes comparing backend performance easier by sharing the tests. + +import ( + "testing" + + "vitess.io/vitess/go/vt/orchestrator/external/raft" +) + +func FirstIndex(b *testing.B, store raft.LogStore) { + // Create some fake data + var logs []*raft.Log + for i := 1; i < 10; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Run FirstIndex a number of times + for n := 0; n < b.N; n++ { + store.FirstIndex() + } +} + +func LastIndex(b *testing.B, store raft.LogStore) { + // Create some fake data + var logs []*raft.Log + for i := 1; i < 10; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Run LastIndex a number of times + for n := 0; n < b.N; n++ { + store.LastIndex() + } +} + +func GetLog(b *testing.B, store raft.LogStore) { + // Create some fake data + var logs []*raft.Log + for i := 1; i < 10; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Run GetLog a number of times + for n := 0; n < b.N; n++ { + if err := store.GetLog(5, new(raft.Log)); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func StoreLog(b *testing.B, store raft.LogStore) { + // Run StoreLog a number of times + for n := 0; n < b.N; n++ { + log := &raft.Log{Index: uint64(n), Data: []byte("data")} + if err := store.StoreLog(log); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func StoreLogs(b *testing.B, store raft.LogStore) { + // Run StoreLogs a number of times. We want to set multiple logs each + // run, so we create 3 logs with incrementing indexes for each iteration. + for n := 0; n < b.N; n++ { + b.StopTimer() + offset := 3 * (n + 1) + logs := []*raft.Log{ + {Index: uint64(offset - 2), Data: []byte("data")}, + {Index: uint64(offset - 1), Data: []byte("data")}, + {Index: uint64(offset), Data: []byte("data")}, + } + b.StartTimer() + + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func DeleteRange(b *testing.B, store raft.LogStore) { + // Create some fake data. In this case, we create 3 new log entries for each + // test case, and separate them by index in multiples of 10. This allows + // some room so that we can test deleting ranges with "extra" logs to + // to ensure we stop going to the database once our max index is hit. 
+ var logs []*raft.Log + for n := 0; n < b.N; n++ { + offset := 10 * n + for i := offset; i < offset+3; i++ { + logs = append(logs, &raft.Log{Index: uint64(i), Data: []byte("data")}) + } + } + if err := store.StoreLogs(logs); err != nil { + b.Fatalf("err: %s", err) + } + b.ResetTimer() + + // Delete a range of the data + for n := 0; n < b.N; n++ { + offset := 10 * n + if err := store.DeleteRange(uint64(offset), uint64(offset+9)); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func Set(b *testing.B, store raft.StableStore) { + // Run Set a number of times + for n := 0; n < b.N; n++ { + if err := store.Set([]byte{byte(n)}, []byte("val")); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func Get(b *testing.B, store raft.StableStore) { + // Create some fake data + for i := 1; i < 10; i++ { + if err := store.Set([]byte{byte(i)}, []byte("val")); err != nil { + b.Fatalf("err: %s", err) + } + } + b.ResetTimer() + + // Run Get a number of times + for n := 0; n < b.N; n++ { + if _, err := store.Get([]byte{0x05}); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func SetUint64(b *testing.B, store raft.StableStore) { + // Run SetUint64 a number of times + for n := 0; n < b.N; n++ { + if err := store.SetUint64([]byte{byte(n)}, uint64(n)); err != nil { + b.Fatalf("err: %s", err) + } + } +} + +func GetUint64(b *testing.B, store raft.StableStore) { + // Create some fake data + for i := 0; i < 10; i++ { + if err := store.SetUint64([]byte{byte(i)}, uint64(i)); err != nil { + b.Fatalf("err: %s", err) + } + } + b.ResetTimer() + + // Run GetUint64 a number of times + for n := 0; n < b.N; n++ { + if _, err := store.Get([]byte{0x05}); err != nil { + b.Fatalf("err: %s", err) + } + } +} diff --git a/go/vt/orchestrator/external/raft/commands.go b/go/vt/orchestrator/external/raft/commands.go new file mode 100644 index 0000000000..739775b354 --- /dev/null +++ b/go/vt/orchestrator/external/raft/commands.go @@ -0,0 +1,84 @@ +package raft + +// AppendEntriesRequest is the command used to append entries to the +// replicated log. +type AppendEntriesRequest struct { + // Provide the current term and leader + Term uint64 + Leader []byte + + // Provide the previous entries for integrity checking + PrevLogEntry uint64 + PrevLogTerm uint64 + + // New entries to commit + Entries []*Log + + // Commit index on the leader + LeaderCommitIndex uint64 +} + +// AppendEntriesResponse is the response returned from an +// AppendEntriesRequest. +type AppendEntriesResponse struct { + // Newer term if leader is out of date + Term uint64 + + // Last Log is a hint to help accelerate rebuilding slow nodes + LastLog uint64 + + // We may not succeed if we have a conflicting entry + Success bool + + // There are scenarios where this request didn't succeed + // but there's no need to wait/back-off the next attempt. + NoRetryBackoff bool +} + +// RequestVoteRequest is the command used by a candidate to ask a Raft peer +// for a vote in an election. +type RequestVoteRequest struct { + // Provide the term and our id + Term uint64 + Candidate []byte + + // Used to ensure safety + LastLogIndex uint64 + LastLogTerm uint64 +} + +// RequestVoteResponse is the response returned from a RequestVoteRequest. 
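+// The Peers field echoes the responder's current peer set so that a node
+// which has been removed from the cluster can notice and shut itself down.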
+type RequestVoteResponse struct { + // Newer term if leader is out of date + Term uint64 + + // Return the peers, so that a node can shutdown on removal + Peers []byte + + // Is the vote granted + Granted bool +} + +// InstallSnapshotRequest is the command sent to a Raft peer to bootstrap its +// log (and state machine) from a snapshot on another peer. +type InstallSnapshotRequest struct { + Term uint64 + Leader []byte + + // These are the last index/term included in the snapshot + LastLogIndex uint64 + LastLogTerm uint64 + + // Peer Set in the snapshot + Peers []byte + + // Size of the snapshot + Size int64 +} + +// InstallSnapshotResponse is the response returned from an +// InstallSnapshotRequest. +type InstallSnapshotResponse struct { + Term uint64 + Success bool +} diff --git a/go/vt/orchestrator/external/raft/config.go b/go/vt/orchestrator/external/raft/config.go new file mode 100644 index 0000000000..2dbd5e601b --- /dev/null +++ b/go/vt/orchestrator/external/raft/config.go @@ -0,0 +1,136 @@ +package raft + +import ( + "fmt" + "io" + "log" + "time" +) + +// Config provides any necessary configuration to +// the Raft server +type Config struct { + // HeartbeatTimeout specifies the time in follower state without + // a leader before we attempt an election. + HeartbeatTimeout time.Duration + + // ElectionTimeout specifies the time in candidate state without + // a leader before we attempt an election. + ElectionTimeout time.Duration + + // CommitTimeout controls the time without an Apply() operation + // before we heartbeat to ensure a timely commit. Due to random + // staggering, may be delayed as much as 2x this value. + CommitTimeout time.Duration + + // MaxAppendEntries controls the maximum number of append entries + // to send at once. We want to strike a balance between efficiency + // and avoiding waste if the follower is going to reject because of + // an inconsistent log. + MaxAppendEntries int + + // If we are a member of a cluster, and RemovePeer is invoked for the + // local node, then we forget all peers and transition into the follower state. + // If ShutdownOnRemove is is set, we additional shutdown Raft. Otherwise, + // we can become a leader of a cluster containing only this node. + ShutdownOnRemove bool + + // DisableBootstrapAfterElect is used to turn off EnableSingleNode + // after the node is elected. This is used to prevent self-election + // if the node is removed from the Raft cluster via RemovePeer. Setting + // it to false will keep the bootstrap mode, allowing the node to self-elect + // and potentially bootstrap a separate cluster. + DisableBootstrapAfterElect bool + + // TrailingLogs controls how many logs we leave after a snapshot. This is + // used so that we can quickly replay logs on a follower instead of being + // forced to send an entire snapshot. + TrailingLogs uint64 + + // SnapshotInterval controls how often we check if we should perform a snapshot. + // We randomly stagger between this value and 2x this value to avoid the entire + // cluster from performing a snapshot at once. + SnapshotInterval time.Duration + + // SnapshotThreshold controls how many outstanding logs there must be before + // we perform a snapshot. This is to prevent excessive snapshots when we can + // just replay a small set of logs. + SnapshotThreshold uint64 + + // EnableSingleNode allows for a single node mode of operation. This + // is false by default, which prevents a lone node from electing itself. + // leader. 
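+	// It is typically enabled only when bootstrapping a brand new cluster,
+	// usually in combination with DisableBootstrapAfterElect.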
+ EnableSingleNode bool + + // LeaderLeaseTimeout is used to control how long the "lease" lasts + // for being the leader without being able to contact a quorum + // of nodes. If we reach this interval without contact, we will + // step down as leader. + LeaderLeaseTimeout time.Duration + + // StartAsLeader forces Raft to start in the leader state. This should + // never be used except for testing purposes, as it can cause a split-brain. + StartAsLeader bool + + // NotifyCh is used to provide a channel that will be notified of leadership + // changes. Raft will block writing to this channel, so it should either be + // buffered or aggressively consumed. + NotifyCh chan<- bool + + // LogOutput is used as a sink for logs, unless Logger is specified. + // Defaults to os.Stderr. + LogOutput io.Writer + + // Logger is a user-provided logger. If nil, a logger writing to LogOutput + // is used. + Logger *log.Logger +} + +// DefaultConfig returns a Config with usable defaults. +func DefaultConfig() *Config { + return &Config{ + HeartbeatTimeout: 1000 * time.Millisecond, + ElectionTimeout: 1000 * time.Millisecond, + CommitTimeout: 50 * time.Millisecond, + MaxAppendEntries: 64, + ShutdownOnRemove: true, + DisableBootstrapAfterElect: true, + TrailingLogs: 10240, + SnapshotInterval: 120 * time.Second, + SnapshotThreshold: 8192, + EnableSingleNode: false, + LeaderLeaseTimeout: 500 * time.Millisecond, + } +} + +// ValidateConfig is used to validate a sane configuration +func ValidateConfig(config *Config) error { + if config.HeartbeatTimeout < 5*time.Millisecond { + return fmt.Errorf("Heartbeat timeout is too low") + } + if config.ElectionTimeout < 5*time.Millisecond { + return fmt.Errorf("Election timeout is too low") + } + if config.CommitTimeout < time.Millisecond { + return fmt.Errorf("Commit timeout is too low") + } + if config.MaxAppendEntries <= 0 { + return fmt.Errorf("MaxAppendEntries must be positive") + } + if config.MaxAppendEntries > 1024 { + return fmt.Errorf("MaxAppendEntries is too large") + } + if config.SnapshotInterval < 5*time.Millisecond { + return fmt.Errorf("Snapshot interval is too low") + } + if config.LeaderLeaseTimeout < 5*time.Millisecond { + return fmt.Errorf("Leader lease timeout is too low") + } + if config.LeaderLeaseTimeout > config.HeartbeatTimeout { + return fmt.Errorf("Leader lease timeout cannot be larger than heartbeat timeout") + } + if config.ElectionTimeout < config.HeartbeatTimeout { + return fmt.Errorf("Election timeout must be equal or greater than Heartbeat Timeout") + } + return nil +} diff --git a/go/vt/orchestrator/external/raft/discard_snapshot.go b/go/vt/orchestrator/external/raft/discard_snapshot.go new file mode 100644 index 0000000000..1b4611d559 --- /dev/null +++ b/go/vt/orchestrator/external/raft/discard_snapshot.go @@ -0,0 +1,48 @@ +package raft + +import ( + "fmt" + "io" +) + +// DiscardSnapshotStore is used to successfully snapshot while +// always discarding the snapshot. This is useful for when the +// log should be truncated but no snapshot should be retained. +// This should never be used for production use, and is only +// suitable for testing. +type DiscardSnapshotStore struct{} + +type DiscardSnapshotSink struct{} + +// NewDiscardSnapshotStore is used to create a new DiscardSnapshotStore. 
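+// The returned store satisfies SnapshotStore but discards all snapshot data,
+// so it is only suitable for tests.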
+func NewDiscardSnapshotStore() *DiscardSnapshotStore { + return &DiscardSnapshotStore{} +} + +func (d *DiscardSnapshotStore) Create(index, term uint64, peers []byte) (SnapshotSink, error) { + return &DiscardSnapshotSink{}, nil +} + +func (d *DiscardSnapshotStore) List() ([]*SnapshotMeta, error) { + return nil, nil +} + +func (d *DiscardSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + return nil, nil, fmt.Errorf("open is not supported") +} + +func (d *DiscardSnapshotSink) Write(b []byte) (int, error) { + return len(b), nil +} + +func (d *DiscardSnapshotSink) Close() error { + return nil +} + +func (d *DiscardSnapshotSink) ID() string { + return "discard" +} + +func (d *DiscardSnapshotSink) Cancel() error { + return nil +} diff --git a/go/vt/orchestrator/external/raft/discard_snapshot_test.go b/go/vt/orchestrator/external/raft/discard_snapshot_test.go new file mode 100644 index 0000000000..5abedfe2c6 --- /dev/null +++ b/go/vt/orchestrator/external/raft/discard_snapshot_test.go @@ -0,0 +1,17 @@ +package raft + +import "testing" + +func TestDiscardSnapshotStoreImpl(t *testing.T) { + var impl interface{} = &DiscardSnapshotStore{} + if _, ok := impl.(SnapshotStore); !ok { + t.Fatalf("DiscardSnapshotStore not a SnapshotStore") + } +} + +func TestDiscardSnapshotSinkImpl(t *testing.T) { + var impl interface{} = &DiscardSnapshotSink{} + if _, ok := impl.(SnapshotSink); !ok { + t.Fatalf("DiscardSnapshotSink not a SnapshotSink") + } +} diff --git a/go/vt/orchestrator/external/raft/file_snapshot.go b/go/vt/orchestrator/external/raft/file_snapshot.go new file mode 100644 index 0000000000..4d841ba8de --- /dev/null +++ b/go/vt/orchestrator/external/raft/file_snapshot.go @@ -0,0 +1,479 @@ +package raft + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "hash" + "hash/crc64" + "io" + "io/ioutil" + "log" + "os" + "path/filepath" + "sort" + "strings" + "time" +) + +const ( + testPath = "permTest" + snapPath = "snapshots" + metaFilePath = "meta.json" + stateFilePath = "state.bin" + tmpSuffix = ".tmp" +) + +// FileSnapshotStore implements the SnapshotStore interface and allows +// snapshots to be made on the local disk. +type FileSnapshotStore struct { + path string + retain int + logger *log.Logger +} + +type snapMetaSlice []*fileSnapshotMeta + +// FileSnapshotSink implements SnapshotSink with a file. +type FileSnapshotSink struct { + store *FileSnapshotStore + logger *log.Logger + dir string + meta fileSnapshotMeta + + stateFile *os.File + stateHash hash.Hash64 + buffered *bufio.Writer + + closed bool +} + +// fileSnapshotMeta is stored on disk. We also put a CRC +// on disk so that we can verify the snapshot. +type fileSnapshotMeta struct { + SnapshotMeta + CRC []byte +} + +// bufferedFile is returned when we open a snapshot. This way +// reads are buffered and the file still gets closed. +type bufferedFile struct { + bh *bufio.Reader + fh *os.File +} + +func (b *bufferedFile) Read(p []byte) (n int, err error) { + return b.bh.Read(p) +} + +func (b *bufferedFile) Close() error { + return b.fh.Close() +} + +// NewFileSnapshotStoreWithLogger creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. 
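+// Passing a nil logger falls back to a default logger writing to os.Stderr.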
+func NewFileSnapshotStoreWithLogger(base string, retain int, logger *log.Logger) (*FileSnapshotStore, error) { + if retain < 1 { + return nil, fmt.Errorf("must retain at least one snapshot") + } + if logger == nil { + logger = log.New(os.Stderr, "", log.LstdFlags) + } + + // Ensure our path exists + path := filepath.Join(base, snapPath) + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return nil, fmt.Errorf("snapshot path not accessible: %v", err) + } + + // Setup the store + store := &FileSnapshotStore{ + path: path, + retain: retain, + logger: logger, + } + + // Do a permissions test + if err := store.testPermissions(); err != nil { + return nil, fmt.Errorf("permissions test failed: %v", err) + } + return store, nil +} + +// NewFileSnapshotStore creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. +func NewFileSnapshotStore(base string, retain int, logOutput io.Writer) (*FileSnapshotStore, error) { + if logOutput == nil { + logOutput = os.Stderr + } + return NewFileSnapshotStoreWithLogger(base, retain, log.New(logOutput, "", log.LstdFlags)) +} + +// testPermissions tries to touch a file in our path to see if it works. +func (f *FileSnapshotStore) testPermissions() error { + path := filepath.Join(f.path, testPath) + fh, err := os.Create(path) + if err != nil { + return err + } + + if err = fh.Close(); err != nil { + return err + } + + if err = os.Remove(path); err != nil { + return err + } + return nil +} + +// snapshotName generates a name for the snapshot. +func snapshotName(term, index uint64) string { + now := time.Now() + msec := now.UnixNano() / int64(time.Millisecond) + return fmt.Sprintf("%d-%d-%d", term, index, msec) +} + +// Create is used to start a new snapshot +func (f *FileSnapshotStore) Create(index, term uint64, peers []byte) (SnapshotSink, error) { + // Create a new path + name := snapshotName(term, index) + path := filepath.Join(f.path, name+tmpSuffix) + f.logger.Printf("[INFO] snapshot: Creating new snapshot at %s", path) + + // Make the directory + if err := os.MkdirAll(path, 0755); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to make snapshot directory: %v", err) + return nil, err + } + + // Create the sink + sink := &FileSnapshotSink{ + store: f, + logger: f.logger, + dir: path, + meta: fileSnapshotMeta{ + SnapshotMeta: SnapshotMeta{ + ID: name, + Index: index, + Term: term, + Peers: peers, + }, + CRC: nil, + }, + } + + // Write out the meta data + if err := sink.writeMeta(); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err) + return nil, err + } + + // Open the state file + statePath := filepath.Join(path, stateFilePath) + fh, err := os.Create(statePath) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to create state file: %v", err) + return nil, err + } + sink.stateFile = fh + + // Create a CRC64 hash + sink.stateHash = crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Wrap both the hash and file in a MultiWriter with buffering + multi := io.MultiWriter(sink.stateFile, sink.stateHash) + sink.buffered = bufio.NewWriter(multi) + + // Done + return sink, nil +} + +// List returns available snapshots in the store. 
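+// Results are ordered newest first and capped at the configured retain count.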
+func (f *FileSnapshotStore) List() ([]*SnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := f.getSnapshots() + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err) + return nil, err + } + + var snapMeta []*SnapshotMeta + for _, meta := range snapshots { + snapMeta = append(snapMeta, &meta.SnapshotMeta) + if len(snapMeta) == f.retain { + break + } + } + return snapMeta, nil +} + +// getSnapshots returns all the known snapshots. +func (f *FileSnapshotStore) getSnapshots() ([]*fileSnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := ioutil.ReadDir(f.path) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to scan snapshot dir: %v", err) + return nil, err + } + + // Populate the metadata + var snapMeta []*fileSnapshotMeta + for _, snap := range snapshots { + // Ignore any files + if !snap.IsDir() { + continue + } + + // Ignore any temporary snapshots + dirName := snap.Name() + if strings.HasSuffix(dirName, tmpSuffix) { + f.logger.Printf("[WARN] snapshot: Found temporary snapshot: %v", dirName) + continue + } + + // Try to read the meta data + meta, err := f.readMeta(dirName) + if err != nil { + f.logger.Printf("[WARN] snapshot: Failed to read metadata for %v: %v", dirName, err) + continue + } + + // Append, but only return up to the retain count + snapMeta = append(snapMeta, meta) + } + + // Sort the snapshot, reverse so we get new -> old + sort.Sort(sort.Reverse(snapMetaSlice(snapMeta))) + + return snapMeta, nil +} + +// readMeta is used to read the meta data for a given named backup +func (f *FileSnapshotStore) readMeta(name string) (*fileSnapshotMeta, error) { + // Open the meta file + metaPath := filepath.Join(f.path, name, metaFilePath) + fh, err := os.Open(metaPath) + if err != nil { + return nil, err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewReader(fh) + + // Read in the JSON + meta := &fileSnapshotMeta{} + dec := json.NewDecoder(buffered) + if err := dec.Decode(meta); err != nil { + return nil, err + } + return meta, nil +} + +// Open takes a snapshot ID and returns a ReadCloser for that snapshot. 
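+// The state file's CRC64 is recomputed and compared against the stored
+// metadata before the reader is returned.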
+func (f *FileSnapshotStore) Open(id string) (*SnapshotMeta, io.ReadCloser, error) { + // Get the metadata + meta, err := f.readMeta(id) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get meta data to open snapshot: %v", err) + return nil, nil, err + } + + // Open the state file + statePath := filepath.Join(f.path, id, stateFilePath) + fh, err := os.Open(statePath) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to open state file: %v", err) + return nil, nil, err + } + + // Create a CRC64 hash + stateHash := crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Compute the hash + _, err = io.Copy(stateHash, fh) + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to read state file: %v", err) + fh.Close() + return nil, nil, err + } + + // Verify the hash + computed := stateHash.Sum(nil) + if bytes.Compare(meta.CRC, computed) != 0 { + f.logger.Printf("[ERR] snapshot: CRC checksum failed (stored: %v computed: %v)", + meta.CRC, computed) + fh.Close() + return nil, nil, fmt.Errorf("CRC mismatch") + } + + // Seek to the start + if _, err := fh.Seek(0, 0); err != nil { + f.logger.Printf("[ERR] snapshot: State file seek failed: %v", err) + fh.Close() + return nil, nil, err + } + + // Return a buffered file + buffered := &bufferedFile{ + bh: bufio.NewReader(fh), + fh: fh, + } + + return &meta.SnapshotMeta, buffered, nil +} + +// ReapSnapshots reaps any snapshots beyond the retain count. +func (f *FileSnapshotStore) ReapSnapshots() error { + snapshots, err := f.getSnapshots() + if err != nil { + f.logger.Printf("[ERR] snapshot: Failed to get snapshots: %v", err) + return err + } + + for i := f.retain; i < len(snapshots); i++ { + path := filepath.Join(f.path, snapshots[i].ID) + f.logger.Printf("[INFO] snapshot: reaping snapshot %v", path) + if err := os.RemoveAll(path); err != nil { + f.logger.Printf("[ERR] snapshot: Failed to reap snapshot %v: %v", path, err) + return err + } + } + return nil +} + +// ID returns the ID of the snapshot, can be used with Open() +// after the snapshot is finalized. +func (s *FileSnapshotSink) ID() string { + return s.meta.ID +} + +// Write is used to append to the state file. We write to the +// buffered IO object to reduce the amount of context switches. +func (s *FileSnapshotSink) Write(b []byte) (int, error) { + return s.buffered.Write(b) +} + +// Close is used to indicate a successful end. +func (s *FileSnapshotSink) Close() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Write out the meta data + if err := s.writeMeta(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to write metadata: %v", err) + return err + } + + // Move the directory into place + newPath := strings.TrimSuffix(s.dir, tmpSuffix) + if err := os.Rename(s.dir, newPath); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to move snapshot into place: %v", err) + return err + } + + // Reap any old snapshots + if err := s.store.ReapSnapshots(); err != nil { + return err + } + + return nil +} + +// Cancel is used to indicate an unsuccessful end. 
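+// The temporary snapshot directory and any partially written state are
+// removed from disk.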
+func (s *FileSnapshotSink) Cancel() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + s.logger.Printf("[ERR] snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Attempt to remove all artifacts + return os.RemoveAll(s.dir) +} + +// finalize is used to close all of our resources. +func (s *FileSnapshotSink) finalize() error { + // Flush any remaining data + if err := s.buffered.Flush(); err != nil { + return err + } + + // Get the file size + stat, statErr := s.stateFile.Stat() + + // Close the file + if err := s.stateFile.Close(); err != nil { + return err + } + + // Set the file size, check after we close + if statErr != nil { + return statErr + } + s.meta.Size = stat.Size() + + // Set the CRC + s.meta.CRC = s.stateHash.Sum(nil) + return nil +} + +// writeMeta is used to write out the metadata we have. +func (s *FileSnapshotSink) writeMeta() error { + // Open the meta file + metaPath := filepath.Join(s.dir, metaFilePath) + fh, err := os.Create(metaPath) + if err != nil { + return err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewWriter(fh) + defer buffered.Flush() + + // Write out as JSON + enc := json.NewEncoder(buffered) + if err := enc.Encode(&s.meta); err != nil { + return err + } + return nil +} + +// Implement the sort interface for []*fileSnapshotMeta. +func (s snapMetaSlice) Len() int { + return len(s) +} + +func (s snapMetaSlice) Less(i, j int) bool { + if s[i].Term != s[j].Term { + return s[i].Term < s[j].Term + } + if s[i].Index != s[j].Index { + return s[i].Index < s[j].Index + } + return s[i].ID < s[j].ID +} + +func (s snapMetaSlice) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} diff --git a/go/vt/orchestrator/external/raft/file_snapshot_test.go b/go/vt/orchestrator/external/raft/file_snapshot_test.go new file mode 100644 index 0000000000..fcd2ef4b8b --- /dev/null +++ b/go/vt/orchestrator/external/raft/file_snapshot_test.go @@ -0,0 +1,343 @@ +package raft + +import ( + "bytes" + "io" + "io/ioutil" + "os" + "runtime" + "testing" +) + +func FileSnapTest(t *testing.T) (string, *FileSnapshotStore) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + + snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + return dir, snap +} + +func TestFileSnapshotStoreImpl(t *testing.T) { + var impl interface{} = &FileSnapshotStore{} + if _, ok := impl.(SnapshotStore); !ok { + t.Fatalf("FileSnapshotStore not a SnapshotStore") + } +} + +func TestFileSnapshotSinkImpl(t *testing.T) { + var impl interface{} = &FileSnapshotSink{} + if _, ok := impl.(SnapshotSink); !ok { + t.Fatalf("FileSnapshotSink not a SnapshotSink") + } +} + +func TestFileSS_CreateSnapshotMissingParentDir(t *testing.T) { + parent, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(parent) + + dir, err := ioutil.TempDir(parent, "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + + snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + + os.RemoveAll(parent) + peers := []byte("all my lovely friends") + _, err = snap.Create(10, 3, peers) + if err != nil { + t.Fatalf("should not fail when using non existing parent") + } + +} +func TestFileSS_CreateSnapshot(t *testing.T) { + // Create a test dir + dir, err := 
ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(dir) + + snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Check no snapshots + snaps, err := snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 0 { + t.Fatalf("did not expect any snapshots: %v", snaps) + } + + // Create a new sink + peers := []byte("all my lovely friends") + sink, err := snap.Create(10, 3, peers) + if err != nil { + t.Fatalf("err: %v", err) + } + + // The sink is not done, should not be in a list! + snaps, err = snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 0 { + t.Fatalf("did not expect any snapshots: %v", snaps) + } + + // Write to the sink + _, err = sink.Write([]byte("first\n")) + if err != nil { + t.Fatalf("err: %v", err) + } + _, err = sink.Write([]byte("second\n")) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Done! + err = sink.Close() + if err != nil { + t.Fatalf("err: %v", err) + } + + // Should have a snapshot! + snaps, err = snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 1 { + t.Fatalf("expect a snapshots: %v", snaps) + } + + // Check the latest + latest := snaps[0] + if latest.Index != 10 { + t.Fatalf("bad snapshot: %v", *latest) + } + if latest.Term != 3 { + t.Fatalf("bad snapshot: %v", *latest) + } + if bytes.Compare(latest.Peers, peers) != 0 { + t.Fatalf("bad snapshot: %v", *latest) + } + if latest.Size != 13 { + t.Fatalf("bad snapshot: %v", *latest) + } + + // Read the snapshot + _, r, err := snap.Open(latest.ID) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Read out everything + var buf bytes.Buffer + if _, err := io.Copy(&buf, r); err != nil { + t.Fatalf("err: %v", err) + } + if err := r.Close(); err != nil { + t.Fatalf("err: %v", err) + } + + // Ensure a match + if bytes.Compare(buf.Bytes(), []byte("first\nsecond\n")) != 0 { + t.Fatalf("content mismatch") + } +} + +func TestFileSS_CancelSnapshot(t *testing.T) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(dir) + + snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Create a new sink + peers := []byte("all my lovely friends") + sink, err := snap.Create(10, 3, peers) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Cancel the snapshot! Should delete + err = sink.Cancel() + if err != nil { + t.Fatalf("err: %v", err) + } + + // The sink is canceled, should not be in a list! + snaps, err := snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 0 { + t.Fatalf("did not expect any snapshots: %v", snaps) + } +} + +func TestFileSS_Retention(t *testing.T) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(dir) + + snap, err := NewFileSnapshotStoreWithLogger(dir, 2, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Create a new sink + peers := []byte("all my lovely friends") + + // Create a few snapshots + for i := 10; i < 15; i++ { + sink, err := snap.Create(uint64(i), 3, peers) + if err != nil { + t.Fatalf("err: %v", err) + } + err = sink.Close() + if err != nil { + t.Fatalf("err: %v", err) + } + } + + // Should only have 2 listed! 
+ snaps, err := snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 2 { + t.Fatalf("expect 2 snapshots: %v", snaps) + } + + // Check they are the latest + if snaps[0].Index != 14 { + t.Fatalf("bad snap: %#v", *snaps[0]) + } + if snaps[1].Index != 13 { + t.Fatalf("bad snap: %#v", *snaps[1]) + } +} + +func TestFileSS_BadPerm(t *testing.T) { + if runtime.GOOS == "windows" { + t.Skip("skipping file permission test on windows") + } + + // Create a temp dir + dir1, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %s", err) + } + defer os.RemoveAll(dir1) + + // Create a sub dir and remove all permissions + dir2, err := ioutil.TempDir(dir1, "badperm") + if err != nil { + t.Fatalf("err: %s", err) + } + if err := os.Chmod(dir2, 000); err != nil { + t.Fatalf("err: %s", err) + } + defer os.Chmod(dir2, 777) // Set perms back for delete + + // Should fail + if _, err := NewFileSnapshotStore(dir2, 3, nil); err == nil { + t.Fatalf("should fail to use dir with bad perms") + } +} + +func TestFileSS_MissingParentDir(t *testing.T) { + parent, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(parent) + + dir, err := ioutil.TempDir(parent, "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + + os.RemoveAll(parent) + _, err = NewFileSnapshotStore(dir, 3, nil) + if err != nil { + t.Fatalf("should not fail when using non existing parent") + } +} + +func TestFileSS_Ordering(t *testing.T) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(dir) + + snap, err := NewFileSnapshotStoreWithLogger(dir, 3, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Create a new sink + peers := []byte("all my lovely friends") + + sink, err := snap.Create(130350, 5, peers) + if err != nil { + t.Fatalf("err: %v", err) + } + err = sink.Close() + if err != nil { + t.Fatalf("err: %v", err) + } + + sink, err = snap.Create(204917, 36, peers) + if err != nil { + t.Fatalf("err: %v", err) + } + err = sink.Close() + if err != nil { + t.Fatalf("err: %v", err) + } + + // Should only have 2 listed! + snaps, err := snap.List() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(snaps) != 2 { + t.Fatalf("expect 2 snapshots: %v", snaps) + } + + // Check they are ordered + if snaps[0].Term != 36 { + t.Fatalf("bad snap: %#v", *snaps[0]) + } + if snaps[1].Term != 5 { + t.Fatalf("bad snap: %#v", *snaps[1]) + } +} diff --git a/go/vt/orchestrator/external/raft/fsm.go b/go/vt/orchestrator/external/raft/fsm.go new file mode 100644 index 0000000000..ae52e9a7c1 --- /dev/null +++ b/go/vt/orchestrator/external/raft/fsm.go @@ -0,0 +1,40 @@ +package raft + +import ( + "io" +) + +// FSM provides an interface that can be implemented by +// clients to make use of the replicated log. +type FSM interface { + // Apply log is invoked once a log entry is committed. + // It returns a value which will be made available in the + // ApplyFuture returned by Raft.Apply method if that + // method was called on the same Raft node as the FSM. + Apply(*Log) interface{} + + // Snapshot is used to support log compaction. This call should + // return an FSMSnapshot which can be used to save a point-in-time + // snapshot of the FSM. Apply and Snapshot are not called in multiple + // threads, but Apply will be called concurrently with Persist. 
This means + // the FSM should be implemented in a fashion that allows for concurrent + // updates while a snapshot is happening. + Snapshot() (FSMSnapshot, error) + + // Restore is used to restore an FSM from a snapshot. It is not called + // concurrently with any other command. The FSM must discard all previous + // state. + Restore(io.ReadCloser) error +} + +// FSMSnapshot is returned by an FSM in response to a Snapshot +// It must be safe to invoke FSMSnapshot methods with concurrent +// calls to Apply. +type FSMSnapshot interface { + // Persist should dump all necessary state to the WriteCloser 'sink', + // and call sink.Close() when finished or call sink.Cancel() on error. + Persist(sink SnapshotSink) error + + // Release is invoked when we are finished with the snapshot. + Release() +} diff --git a/go/vt/orchestrator/external/raft/future.go b/go/vt/orchestrator/external/raft/future.go new file mode 100644 index 0000000000..b80f9090fa --- /dev/null +++ b/go/vt/orchestrator/external/raft/future.go @@ -0,0 +1,203 @@ +package raft + +import ( + "sync" + "time" +) + +// Future is used to represent an action that may occur in the future. +type Future interface { + // Error blocks until the future arrives and then + // returns the error status of the future. + // This may be called any number of times - all + // calls will return the same value. + // Note that it is not OK to call this method + // twice concurrently on the same Future instance. + Error() error +} + +// ApplyFuture is used for Apply() and can returns the FSM response. +type ApplyFuture interface { + Future + + // Response returns the FSM response as returned + // by the FSM.Apply method. This must not be called + // until after the Error method has returned. + Response() interface{} + + // Index holds the index of the newly applied log entry. + // This must not be called + // until after the Error method has returned. + Index() uint64 +} + +// errorFuture is used to return a static error. +type errorFuture struct { + err error +} + +func (e errorFuture) Error() error { + return e.err +} + +func (e errorFuture) Response() interface{} { + return nil +} + +func (e errorFuture) Index() uint64 { + return 0 +} + +// deferError can be embedded to allow a future +// to provide an error in the future. +type deferError struct { + err error + errCh chan error + responded bool +} + +func (d *deferError) init() { + d.errCh = make(chan error, 1) +} + +func (d *deferError) Error() error { + if d.err != nil { + // Note that when we've received a nil error, this + // won't trigger, but the channel is closed after + // send so we'll still return nil below. + return d.err + } + if d.errCh == nil { + panic("waiting for response on nil channel") + } + d.err = <-d.errCh + return d.err +} + +func (d *deferError) respond(err error) { + if d.errCh == nil { + return + } + if d.responded { + return + } + d.errCh <- err + close(d.errCh) + d.responded = true +} + +// logFuture is used to apply a log entry and waits until +// the log is considered committed. 
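+// The embedded deferError lets callers block in Error() until the leader
+// responds, which normally happens once the quorumPolicy reports the entry
+// as committed.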
+type logFuture struct { + deferError + log Log + policy quorumPolicy + response interface{} + dispatch time.Time +} + +func (l *logFuture) Response() interface{} { + return l.response +} + +func (l *logFuture) Index() uint64 { + return l.log.Index +} + +type peerFuture struct { + deferError + peers []string +} + +type shutdownFuture struct { + raft *Raft +} + +func (s *shutdownFuture) Error() error { + if s.raft == nil { + return nil + } + s.raft.waitShutdown() + if closeable, ok := s.raft.trans.(WithClose); ok { + closeable.Close() + } + return nil +} + +// snapshotFuture is used for waiting on a snapshot to complete. +type snapshotFuture struct { + deferError +} + +// reqSnapshotFuture is used for requesting a snapshot start. +// It is only used internally. +type reqSnapshotFuture struct { + deferError + + // snapshot details provided by the FSM runner before responding + index uint64 + term uint64 + peers []string + snapshot FSMSnapshot +} + +// restoreFuture is used for requesting an FSM to perform a +// snapshot restore. Used internally only. +type restoreFuture struct { + deferError + ID string +} + +// verifyFuture is used to verify the current node is still +// the leader. This is to prevent a stale read. +type verifyFuture struct { + deferError + notifyCh chan *verifyFuture + quorumSize int + votes int + voteLock sync.Mutex +} + +// vote is used to respond to a verifyFuture. +// This may block when responding on the notifyCh. +func (v *verifyFuture) vote(leader bool) { + v.voteLock.Lock() + defer v.voteLock.Unlock() + + // Guard against having notified already + if v.notifyCh == nil { + return + } + + if leader { + v.votes++ + if v.votes >= v.quorumSize { + v.notifyCh <- v + v.notifyCh = nil + } + } else { + v.notifyCh <- v + v.notifyCh = nil + } +} + +// appendFuture is used for waiting on a pipelined append +// entries RPC. +type appendFuture struct { + deferError + start time.Time + args *AppendEntriesRequest + resp *AppendEntriesResponse +} + +func (a *appendFuture) Start() time.Time { + return a.start +} + +func (a *appendFuture) Request() *AppendEntriesRequest { + return a.args +} + +func (a *appendFuture) Response() *AppendEntriesResponse { + return a.resp +} diff --git a/go/vt/orchestrator/external/raft/future_test.go b/go/vt/orchestrator/external/raft/future_test.go new file mode 100644 index 0000000000..8bb958329f --- /dev/null +++ b/go/vt/orchestrator/external/raft/future_test.go @@ -0,0 +1,42 @@ +package raft + +import ( + "errors" + "testing" +) + +func TestDeferFutureSuccess(t *testing.T) { + var f deferError + f.init() + f.respond(nil) + if err := f.Error(); err != nil { + t.Fatalf("unexpected error result; got %#v want nil", err) + } + if err := f.Error(); err != nil { + t.Fatalf("unexpected error result; got %#v want nil", err) + } +} + +func TestDeferFutureError(t *testing.T) { + want := errors.New("x") + var f deferError + f.init() + f.respond(want) + if got := f.Error(); got != want { + t.Fatalf("unexpected error result; got %#v want %#v", got, want) + } + if got := f.Error(); got != want { + t.Fatalf("unexpected error result; got %#v want %#v", got, want) + } +} + +func TestDeferFutureConcurrent(t *testing.T) { + // Food for the race detector. 
+ want := errors.New("x") + var f deferError + f.init() + go f.respond(want) + if got := f.Error(); got != want { + t.Errorf("unexpected error result; got %#v want %#v", got, want) + } +} diff --git a/go/vt/orchestrator/external/raft/inflight.go b/go/vt/orchestrator/external/raft/inflight.go new file mode 100644 index 0000000000..7014ff5039 --- /dev/null +++ b/go/vt/orchestrator/external/raft/inflight.go @@ -0,0 +1,213 @@ +package raft + +import ( + "container/list" + "sync" +) + +// QuorumPolicy allows individual logFutures to have different +// commitment rules while still using the inflight mechanism. +type quorumPolicy interface { + // Checks if a commit from a given peer is enough to + // satisfy the commitment rules + Commit() bool + + // Checks if a commit is committed + IsCommitted() bool +} + +// MajorityQuorum is used by Apply transactions and requires +// a simple majority of nodes. +type majorityQuorum struct { + count int + votesNeeded int +} + +func newMajorityQuorum(clusterSize int) *majorityQuorum { + votesNeeded := (clusterSize / 2) + 1 + return &majorityQuorum{count: 0, votesNeeded: votesNeeded} +} + +func (m *majorityQuorum) Commit() bool { + m.count++ + return m.count >= m.votesNeeded +} + +func (m *majorityQuorum) IsCommitted() bool { + return m.count >= m.votesNeeded +} + +// Inflight is used to track operations that are still in-flight. +type inflight struct { + sync.Mutex + committed *list.List + commitCh chan struct{} + minCommit uint64 + maxCommit uint64 + operations map[uint64]*logFuture + stopCh chan struct{} +} + +// NewInflight returns an inflight struct that notifies +// the provided channel when logs are finished committing. +func newInflight(commitCh chan struct{}) *inflight { + return &inflight{ + committed: list.New(), + commitCh: commitCh, + minCommit: 0, + maxCommit: 0, + operations: make(map[uint64]*logFuture), + stopCh: make(chan struct{}), + } +} + +// Start is used to mark a logFuture as being inflight. It +// also commits the entry, as it is assumed the leader is +// starting. +func (i *inflight) Start(l *logFuture) { + i.Lock() + defer i.Unlock() + i.start(l) +} + +// StartAll is used to mark a list of logFuture's as being +// inflight. It also commits each entry as the leader is +// assumed to be starting. +func (i *inflight) StartAll(logs []*logFuture) { + i.Lock() + defer i.Unlock() + for _, l := range logs { + i.start(l) + } +} + +// start is used to mark a single entry as inflight, +// must be invoked with the lock held. +func (i *inflight) start(l *logFuture) { + idx := l.log.Index + i.operations[idx] = l + + if idx > i.maxCommit { + i.maxCommit = idx + } + if i.minCommit == 0 { + i.minCommit = idx + } + i.commit(idx) +} + +// Cancel is used to cancel all in-flight operations. +// This is done when the leader steps down, and all futures +// are sent the given error. 
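+// After Cancel returns the tracker is not reusable: commitCh is closed and
+// all tracked operations have been cleared.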
+func (i *inflight) Cancel(err error) { + // Close the channel first to unblock any pending commits + close(i.stopCh) + + // Lock after close to avoid deadlock + i.Lock() + defer i.Unlock() + + // Respond to all inflight operations + for _, op := range i.operations { + op.respond(err) + } + + // Clear all the committed but not processed + for e := i.committed.Front(); e != nil; e = e.Next() { + e.Value.(*logFuture).respond(err) + } + + // Clear the map + i.operations = make(map[uint64]*logFuture) + + // Clear the list of committed + i.committed = list.New() + + // Close the commmitCh + close(i.commitCh) + + // Reset indexes + i.minCommit = 0 + i.maxCommit = 0 +} + +// Committed returns all the committed operations in order. +func (i *inflight) Committed() (l *list.List) { + i.Lock() + l, i.committed = i.committed, list.New() + i.Unlock() + return l +} + +// Commit is used by leader replication routines to indicate that +// a follower was finished committing a log to disk. +func (i *inflight) Commit(index uint64) { + i.Lock() + defer i.Unlock() + i.commit(index) +} + +// CommitRange is used to commit a range of indexes inclusively. +// It is optimized to avoid commits for indexes that are not tracked. +func (i *inflight) CommitRange(minIndex, maxIndex uint64) { + i.Lock() + defer i.Unlock() + + // Update the minimum index + minIndex = max(i.minCommit, minIndex) + + // Commit each index + for idx := minIndex; idx <= maxIndex; idx++ { + i.commit(idx) + } +} + +// commit is used to commit a single index. Must be called with the lock held. +func (i *inflight) commit(index uint64) { + op, ok := i.operations[index] + if !ok { + // Ignore if not in the map, as it may be committed already + return + } + + // Check if we've satisfied the commit + if !op.policy.Commit() { + return + } + + // Cannot commit if this is not the minimum inflight. This can happen + // if the quorum size changes, meaning a previous commit requires a larger + // quorum that this commit. We MUST block until the previous log is committed, + // otherwise logs will be applied out of order. 
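+	// In that case the entry stays in i.operations with its policy already
+	// satisfied; the walk at the bottom of this function picks it up once
+	// every lower index has been committed.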
+ if index != i.minCommit { + return + } + +NOTIFY: + // Add the operation to the committed list + i.committed.PushBack(op) + + // Stop tracking since it is committed + delete(i.operations, index) + + // Update the indexes + if index == i.maxCommit { + i.minCommit = 0 + i.maxCommit = 0 + + } else { + i.minCommit++ + } + + // Check if the next in-flight operation is ready + if i.minCommit != 0 { + op = i.operations[i.minCommit] + if op.policy.IsCommitted() { + index = i.minCommit + goto NOTIFY + } + } + + // Async notify of ready operations + asyncNotifyCh(i.commitCh) +} diff --git a/go/vt/orchestrator/external/raft/inflight_test.go b/go/vt/orchestrator/external/raft/inflight_test.go new file mode 100644 index 0000000000..a9f57d6ead --- /dev/null +++ b/go/vt/orchestrator/external/raft/inflight_test.go @@ -0,0 +1,150 @@ +package raft + +import ( + "fmt" + "testing" +) + +func TestInflight_StartCommit(t *testing.T) { + commitCh := make(chan struct{}, 1) + in := newInflight(commitCh) + + // Commit a transaction as being in flight + l := &logFuture{log: Log{Index: 1}} + l.policy = newMajorityQuorum(5) + in.Start(l) + + // Commit 3 times + in.Commit(1) + if in.Committed().Len() != 0 { + t.Fatalf("should not be commited") + } + + in.Commit(1) + if in.Committed().Len() != 1 { + t.Fatalf("should be commited") + } + + // Already committed but should work anyways + in.Commit(1) +} + +func TestInflight_Cancel(t *testing.T) { + commitCh := make(chan struct{}, 1) + in := newInflight(commitCh) + + // Commit a transaction as being in flight + l := &logFuture{ + log: Log{Index: 1}, + } + l.init() + l.policy = newMajorityQuorum(3) + in.Start(l) + + // Cancel with an error + err := fmt.Errorf("error 1") + in.Cancel(err) + + // Should get an error return + if l.Error() != err { + t.Fatalf("expected error") + } +} + +func TestInflight_StartAll(t *testing.T) { + commitCh := make(chan struct{}, 1) + in := newInflight(commitCh) + + // Commit a few transaction as being in flight + l1 := &logFuture{log: Log{Index: 2}} + l1.policy = newMajorityQuorum(5) + l2 := &logFuture{log: Log{Index: 3}} + l2.policy = newMajorityQuorum(5) + l3 := &logFuture{log: Log{Index: 4}} + l3.policy = newMajorityQuorum(5) + + // Start all the entries + in.StartAll([]*logFuture{l1, l2, l3}) + + // Commit ranges + in.CommitRange(1, 5) + in.CommitRange(1, 4) + in.CommitRange(1, 10) + + // Should get 3 back + if in.Committed().Len() != 3 { + t.Fatalf("expected all 3 to commit") + } +} + +func TestInflight_CommitRange(t *testing.T) { + commitCh := make(chan struct{}, 1) + in := newInflight(commitCh) + + // Commit a few transaction as being in flight + l1 := &logFuture{log: Log{Index: 2}} + l1.policy = newMajorityQuorum(5) + in.Start(l1) + + l2 := &logFuture{log: Log{Index: 3}} + l2.policy = newMajorityQuorum(5) + in.Start(l2) + + l3 := &logFuture{log: Log{Index: 4}} + l3.policy = newMajorityQuorum(5) + in.Start(l3) + + // Commit ranges + in.CommitRange(1, 5) + in.CommitRange(1, 4) + in.CommitRange(1, 10) + + // Should get 3 back + if in.Committed().Len() != 3 { + t.Fatalf("expected all 3 to commit") + } +} + +// Should panic if we commit non contiguously! 
+func TestInflight_NonContiguous(t *testing.T) { + commitCh := make(chan struct{}, 1) + in := newInflight(commitCh) + + // Commit a few transaction as being in flight + l1 := &logFuture{log: Log{Index: 2}} + l1.policy = newMajorityQuorum(5) + in.Start(l1) + + l2 := &logFuture{log: Log{Index: 3}} + l2.policy = newMajorityQuorum(5) + in.Start(l2) + + in.Commit(3) + in.Commit(3) + in.Commit(3) // panic! + + if in.Committed().Len() != 0 { + t.Fatalf("should not commit") + } + + in.Commit(2) + in.Commit(2) + in.Commit(2) // panic! + + committed := in.Committed() + if committed.Len() != 2 { + t.Fatalf("should commit both") + } + + current := committed.Front() + l := current.Value.(*logFuture) + if l.log.Index != 2 { + t.Fatalf("bad: %v", *l) + } + + current = current.Next() + l = current.Value.(*logFuture) + if l.log.Index != 3 { + t.Fatalf("bad: %v", *l) + } +} diff --git a/go/vt/orchestrator/external/raft/inmem_store.go b/go/vt/orchestrator/external/raft/inmem_store.go new file mode 100644 index 0000000000..6e4dfd020f --- /dev/null +++ b/go/vt/orchestrator/external/raft/inmem_store.go @@ -0,0 +1,116 @@ +package raft + +import ( + "sync" +) + +// InmemStore implements the LogStore and StableStore interface. +// It should NOT EVER be used for production. It is used only for +// unit tests. Use the MDBStore implementation instead. +type InmemStore struct { + l sync.RWMutex + lowIndex uint64 + highIndex uint64 + logs map[uint64]*Log + kv map[string][]byte + kvInt map[string]uint64 +} + +// NewInmemStore returns a new in-memory backend. Do not ever +// use for production. Only for testing. +func NewInmemStore() *InmemStore { + i := &InmemStore{ + logs: make(map[uint64]*Log), + kv: make(map[string][]byte), + kvInt: make(map[string]uint64), + } + return i +} + +// FirstIndex implements the LogStore interface. +func (i *InmemStore) FirstIndex() (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.lowIndex, nil +} + +// LastIndex implements the LogStore interface. +func (i *InmemStore) LastIndex() (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.highIndex, nil +} + +// GetLog implements the LogStore interface. +func (i *InmemStore) GetLog(index uint64, log *Log) error { + i.l.RLock() + defer i.l.RUnlock() + l, ok := i.logs[index] + if !ok { + return ErrLogNotFound + } + *log = *l + return nil +} + +// StoreLog implements the LogStore interface. +func (i *InmemStore) StoreLog(log *Log) error { + return i.StoreLogs([]*Log{log}) +} + +// StoreLogs implements the LogStore interface. +func (i *InmemStore) StoreLogs(logs []*Log) error { + i.l.Lock() + defer i.l.Unlock() + for _, l := range logs { + i.logs[l.Index] = l + if i.lowIndex == 0 { + i.lowIndex = l.Index + } + if l.Index > i.highIndex { + i.highIndex = l.Index + } + } + return nil +} + +// DeleteRange implements the LogStore interface. +func (i *InmemStore) DeleteRange(min, max uint64) error { + i.l.Lock() + defer i.l.Unlock() + for j := min; j <= max; j++ { + delete(i.logs, j) + } + i.lowIndex = max + 1 + return nil +} + +// Set implements the StableStore interface. +func (i *InmemStore) Set(key []byte, val []byte) error { + i.l.Lock() + defer i.l.Unlock() + i.kv[string(key)] = val + return nil +} + +// Get implements the StableStore interface. +func (i *InmemStore) Get(key []byte) ([]byte, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.kv[string(key)], nil +} + +// SetUint64 implements the StableStore interface. 
+func (i *InmemStore) SetUint64(key []byte, val uint64) error { + i.l.Lock() + defer i.l.Unlock() + i.kvInt[string(key)] = val + return nil +} + +// GetUint64 implements the StableStore interface. +func (i *InmemStore) GetUint64(key []byte) (uint64, error) { + i.l.RLock() + defer i.l.RUnlock() + return i.kvInt[string(key)], nil +} diff --git a/go/vt/orchestrator/external/raft/inmem_transport.go b/go/vt/orchestrator/external/raft/inmem_transport.go new file mode 100644 index 0000000000..2d5f319069 --- /dev/null +++ b/go/vt/orchestrator/external/raft/inmem_transport.go @@ -0,0 +1,324 @@ +package raft + +import ( + "fmt" + "io" + "sync" + "time" +) + +// NewInmemAddr returns a new in-memory addr with +// a randomly generate UUID as the ID. +func NewInmemAddr() string { + return generateUUID() +} + +// inmemPipeline is used to pipeline requests for the in-mem transport. +type inmemPipeline struct { + trans *InmemTransport + peer *InmemTransport + peerAddr string + + doneCh chan AppendFuture + inprogressCh chan *inmemPipelineInflight + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex +} + +type inmemPipelineInflight struct { + future *appendFuture + respCh <-chan RPCResponse +} + +// InmemTransport Implements the Transport interface, to allow Raft to be +// tested in-memory without going over a network. +type InmemTransport struct { + sync.RWMutex + consumerCh chan RPC + localAddr string + peers map[string]*InmemTransport + pipelines []*inmemPipeline + timeout time.Duration +} + +// NewInmemTransport is used to initialize a new transport +// and generates a random local address if none is specified +func NewInmemTransport(addr string) (string, *InmemTransport) { + if addr == "" { + addr = NewInmemAddr() + } + trans := &InmemTransport{ + consumerCh: make(chan RPC, 16), + localAddr: addr, + peers: make(map[string]*InmemTransport), + timeout: 50 * time.Millisecond, + } + return addr, trans +} + +// SetHeartbeatHandler is used to set optional fast-path for +// heartbeats, not supported for this transport. +func (i *InmemTransport) SetHeartbeatHandler(cb func(RPC)) { +} + +// Consumer implements the Transport interface. +func (i *InmemTransport) Consumer() <-chan RPC { + return i.consumerCh +} + +// LocalAddr implements the Transport interface. +func (i *InmemTransport) LocalAddr() string { + return i.localAddr +} + +// AppendEntriesPipeline returns an interface that can be used to pipeline +// AppendEntries requests. +func (i *InmemTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) { + i.RLock() + peer, ok := i.peers[target] + i.RUnlock() + if !ok { + return nil, fmt.Errorf("failed to connect to peer: %v", target) + } + pipeline := newInmemPipeline(i, peer, target) + i.Lock() + i.pipelines = append(i.pipelines, pipeline) + i.Unlock() + return pipeline, nil +} + +// AppendEntries implements the Transport interface. +func (i *InmemTransport) AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error { + rpcResp, err := i.makeRPC(target, args, nil, i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*AppendEntriesResponse) + *resp = *out + return nil +} + +// RequestVote implements the Transport interface. 
+func (i *InmemTransport) RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error { + rpcResp, err := i.makeRPC(target, args, nil, i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*RequestVoteResponse) + *resp = *out + return nil +} + +// InstallSnapshot implements the Transport interface. +func (i *InmemTransport) InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error { + rpcResp, err := i.makeRPC(target, args, data, 10*i.timeout) + if err != nil { + return err + } + + // Copy the result back + out := rpcResp.Response.(*InstallSnapshotResponse) + *resp = *out + return nil +} + +func (i *InmemTransport) makeRPC(target string, args interface{}, r io.Reader, timeout time.Duration) (rpcResp RPCResponse, err error) { + i.RLock() + peer, ok := i.peers[target] + i.RUnlock() + + if !ok { + err = fmt.Errorf("failed to connect to peer: %v", target) + return + } + + // Send the RPC over + respCh := make(chan RPCResponse) + peer.consumerCh <- RPC{ + Command: args, + Reader: r, + RespChan: respCh, + } + + // Wait for a response + select { + case rpcResp = <-respCh: + if rpcResp.Error != nil { + err = rpcResp.Error + } + case <-time.After(timeout): + err = fmt.Errorf("command timed out") + } + return +} + +// EncodePeer implements the Transport interface. It uses the UUID as the +// address directly. +func (i *InmemTransport) EncodePeer(p string) []byte { + return []byte(p) +} + +// DecodePeer implements the Transport interface. It wraps the UUID in an +// InmemAddr. +func (i *InmemTransport) DecodePeer(buf []byte) string { + return string(buf) +} + +// Connect is used to connect this transport to another transport for +// a given peer name. This allows for local routing. +func (i *InmemTransport) Connect(peer string, t Transport) { + trans := t.(*InmemTransport) + i.Lock() + defer i.Unlock() + i.peers[peer] = trans +} + +// Disconnect is used to remove the ability to route to a given peer. +func (i *InmemTransport) Disconnect(peer string) { + i.Lock() + defer i.Unlock() + delete(i.peers, peer) + + // Disconnect any pipelines + n := len(i.pipelines) + for idx := 0; idx < n; idx++ { + if i.pipelines[idx].peerAddr == peer { + i.pipelines[idx].Close() + i.pipelines[idx], i.pipelines[n-1] = i.pipelines[n-1], nil + idx-- + n-- + } + } + i.pipelines = i.pipelines[:n] +} + +// DisconnectAll is used to remove all routes to peers. 
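+// Any open append pipelines to peers are closed as part of removing the routes.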
+func (i *InmemTransport) DisconnectAll() { + i.Lock() + defer i.Unlock() + i.peers = make(map[string]*InmemTransport) + + // Handle pipelines + for _, pipeline := range i.pipelines { + pipeline.Close() + } + i.pipelines = nil +} + +// Close is used to permanently disable the transport +func (i *InmemTransport) Close() error { + i.DisconnectAll() + return nil +} + +func newInmemPipeline(trans *InmemTransport, peer *InmemTransport, addr string) *inmemPipeline { + i := &inmemPipeline{ + trans: trans, + peer: peer, + peerAddr: addr, + doneCh: make(chan AppendFuture, 16), + inprogressCh: make(chan *inmemPipelineInflight, 16), + shutdownCh: make(chan struct{}), + } + go i.decodeResponses() + return i +} + +func (i *inmemPipeline) decodeResponses() { + timeout := i.trans.timeout + for { + select { + case inp := <-i.inprogressCh: + var timeoutCh <-chan time.Time + if timeout > 0 { + timeoutCh = time.After(timeout) + } + + select { + case rpcResp := <-inp.respCh: + // Copy the result back + *inp.future.resp = *rpcResp.Response.(*AppendEntriesResponse) + inp.future.respond(rpcResp.Error) + + select { + case i.doneCh <- inp.future: + case <-i.shutdownCh: + return + } + + case <-timeoutCh: + inp.future.respond(fmt.Errorf("command timed out")) + select { + case i.doneCh <- inp.future: + case <-i.shutdownCh: + return + } + + case <-i.shutdownCh: + return + } + case <-i.shutdownCh: + return + } + } +} + +func (i *inmemPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) { + // Create a new future + future := &appendFuture{ + start: time.Now(), + args: args, + resp: resp, + } + future.init() + + // Handle a timeout + var timeout <-chan time.Time + if i.trans.timeout > 0 { + timeout = time.After(i.trans.timeout) + } + + // Send the RPC over + respCh := make(chan RPCResponse, 1) + rpc := RPC{ + Command: args, + RespChan: respCh, + } + select { + case i.peer.consumerCh <- rpc: + case <-timeout: + return nil, fmt.Errorf("command enqueue timeout") + case <-i.shutdownCh: + return nil, ErrPipelineShutdown + } + + // Send to be decoded + select { + case i.inprogressCh <- &inmemPipelineInflight{future, respCh}: + return future, nil + case <-i.shutdownCh: + return nil, ErrPipelineShutdown + } +} + +func (i *inmemPipeline) Consumer() <-chan AppendFuture { + return i.doneCh +} + +func (i *inmemPipeline) Close() error { + i.shutdownLock.Lock() + defer i.shutdownLock.Unlock() + if i.shutdown { + return nil + } + + i.shutdown = true + close(i.shutdownCh) + return nil +} diff --git a/go/vt/orchestrator/external/raft/inmem_transport_test.go b/go/vt/orchestrator/external/raft/inmem_transport_test.go new file mode 100644 index 0000000000..82c95348a5 --- /dev/null +++ b/go/vt/orchestrator/external/raft/inmem_transport_test.go @@ -0,0 +1,18 @@ +package raft + +import ( + "testing" +) + +func TestInmemTransportImpl(t *testing.T) { + var inm interface{} = &InmemTransport{} + if _, ok := inm.(Transport); !ok { + t.Fatalf("InmemTransport is not a Transport") + } + if _, ok := inm.(LoopbackTransport); !ok { + t.Fatalf("InmemTransport is not a Loopback Transport") + } + if _, ok := inm.(WithPeers); !ok { + t.Fatalf("InmemTransport is not a WithPeers Transport") + } +} diff --git a/go/vt/orchestrator/external/raft/integ_test.go b/go/vt/orchestrator/external/raft/integ_test.go new file mode 100644 index 0000000000..c4bf67a7ac --- /dev/null +++ b/go/vt/orchestrator/external/raft/integ_test.go @@ -0,0 +1,268 @@ +package raft + +import ( + "bytes" + "fmt" + "io/ioutil" + "log" + "os" + 
"testing" + "time" +) + +// CheckInteg will skip a test if integration testing is not enabled. +func CheckInteg(t *testing.T) { + if !IsInteg() { + t.SkipNow() + } +} + +// IsInteg returns a boolean telling you if we're in integ testing mode. +func IsInteg() bool { + return os.Getenv("INTEG_TESTS") != "" +} + +type RaftEnv struct { + dir string + conf *Config + fsm *MockFSM + store *InmemStore + snapshot *FileSnapshotStore + peers *JSONPeers + trans *NetworkTransport + raft *Raft + logger *log.Logger +} + +func (r *RaftEnv) Release() { + r.logger.Printf("[WARN] Release node at %v", r.raft.localAddr) + f := r.raft.Shutdown() + if err := f.Error(); err != nil { + panic(err) + } + r.trans.Close() + os.RemoveAll(r.dir) +} + +func MakeRaft(t *testing.T, conf *Config) *RaftEnv { + // Set the config + if conf == nil { + conf = inmemConfig(t) + } + + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + + stable := NewInmemStore() + + snap, err := NewFileSnapshotStore(dir, 3, nil) + if err != nil { + t.Fatalf("err: %v", err) + } + + env := &RaftEnv{ + conf: conf, + dir: dir, + store: stable, + snapshot: snap, + fsm: &MockFSM{}, + logger: log.New(&testLoggerAdapter{t: t}, "", log.Lmicroseconds), + } + + trans, err := NewTCPTransport("127.0.0.1:0", nil, 2, time.Second, nil) + if err != nil { + t.Fatalf("err: %v", err) + } + env.trans = trans + + env.peers = NewJSONPeers(dir, trans) + + env.logger.Printf("[INFO] Starting node at %v", trans.LocalAddr()) + raft, err := NewRaft(conf, env.fsm, stable, stable, snap, env.peers, trans) + if err != nil { + t.Fatalf("err: %v", err) + } + env.raft = raft + return env +} + +func WaitFor(env *RaftEnv, state RaftState) error { + limit := time.Now().Add(200 * time.Millisecond) + for env.raft.State() != state { + if time.Now().Before(limit) { + time.Sleep(10 * time.Millisecond) + } else { + return fmt.Errorf("failed to transition to state %v", state) + } + } + return nil +} + +func WaitForAny(state RaftState, envs []*RaftEnv) (*RaftEnv, error) { + limit := time.Now().Add(200 * time.Millisecond) +CHECK: + for _, env := range envs { + if env.raft.State() == state { + return env, nil + } + } + if time.Now().Before(limit) { + goto WAIT + } + return nil, fmt.Errorf("failed to find node in %v state", state) +WAIT: + time.Sleep(10 * time.Millisecond) + goto CHECK +} + +func WaitFuture(f Future, t *testing.T) error { + timer := time.AfterFunc(200*time.Millisecond, func() { + panic(fmt.Errorf("timeout waiting for future %v", f)) + }) + defer timer.Stop() + return f.Error() +} + +func NoErr(err error, t *testing.T) { + if err != nil { + t.Fatalf("err: %v", err) + } +} + +func CheckConsistent(envs []*RaftEnv, t *testing.T) { + limit := time.Now().Add(400 * time.Millisecond) + first := envs[0] + var err error +CHECK: + l1 := len(first.fsm.logs) + for i := 1; i < len(envs); i++ { + env := envs[i] + l2 := len(env.fsm.logs) + if l1 != l2 { + err = fmt.Errorf("log length mismatch %d %d", l1, l2) + goto ERR + } + for idx, log := range first.fsm.logs { + other := env.fsm.logs[idx] + if bytes.Compare(log, other) != 0 { + err = fmt.Errorf("log %d mismatch %v %v", idx, log, other) + goto ERR + } + } + } + return +ERR: + if time.Now().After(limit) { + t.Fatalf("%v", err) + } + time.Sleep(20 * time.Millisecond) + goto CHECK +} + +// Tests Raft by creating a cluster, growing it to 5 nodes while +// causing various stressful conditions +func TestRaft_Integ(t *testing.T) { + CheckInteg(t) + conf := DefaultConfig() + conf.HeartbeatTimeout = 50 * 
time.Millisecond + conf.ElectionTimeout = 50 * time.Millisecond + conf.LeaderLeaseTimeout = 50 * time.Millisecond + conf.CommitTimeout = 5 * time.Millisecond + conf.SnapshotThreshold = 100 + conf.TrailingLogs = 10 + conf.EnableSingleNode = true + + // Create a single node + env1 := MakeRaft(t, conf) + NoErr(WaitFor(env1, Leader), t) + + // Do some commits + var futures []Future + for i := 0; i < 100; i++ { + futures = append(futures, env1.raft.Apply([]byte(fmt.Sprintf("test%d", i)), 0)) + } + for _, f := range futures { + NoErr(WaitFuture(f, t), t) + env1.logger.Printf("[DEBUG] Applied %v", f) + } + + // Do a snapshot + NoErr(WaitFuture(env1.raft.Snapshot(), t), t) + + // Join a few nodes! + var envs []*RaftEnv + for i := 0; i < 4; i++ { + env := MakeRaft(t, conf) + addr := env.trans.LocalAddr() + NoErr(WaitFuture(env1.raft.AddPeer(addr), t), t) + envs = append(envs, env) + } + + // Wait for a leader + leader, err := WaitForAny(Leader, append([]*RaftEnv{env1}, envs...)) + NoErr(err, t) + + // Do some more commits + futures = nil + for i := 0; i < 100; i++ { + futures = append(futures, leader.raft.Apply([]byte(fmt.Sprintf("test%d", i)), 0)) + } + for _, f := range futures { + NoErr(WaitFuture(f, t), t) + leader.logger.Printf("[DEBUG] Applied %v", f) + } + + // Shoot two nodes in the head! + rm1, rm2 := envs[0], envs[1] + rm1.Release() + rm2.Release() + envs = envs[2:] + time.Sleep(10 * time.Millisecond) + + // Wait for a leader + leader, err = WaitForAny(Leader, append([]*RaftEnv{env1}, envs...)) + NoErr(err, t) + + // Do some more commits + futures = nil + for i := 0; i < 100; i++ { + futures = append(futures, leader.raft.Apply([]byte(fmt.Sprintf("test%d", i)), 0)) + } + for _, f := range futures { + NoErr(WaitFuture(f, t), t) + leader.logger.Printf("[DEBUG] Applied %v", f) + } + + // Join a few new nodes! + for i := 0; i < 2; i++ { + env := MakeRaft(t, conf) + addr := env.trans.LocalAddr() + NoErr(WaitFuture(leader.raft.AddPeer(addr), t), t) + envs = append(envs, env) + } + + // Remove the old nodes + NoErr(WaitFuture(leader.raft.RemovePeer(rm1.raft.localAddr), t), t) + NoErr(WaitFuture(leader.raft.RemovePeer(rm2.raft.localAddr), t), t) + + // Shoot the leader + env1.Release() + time.Sleep(3 * conf.HeartbeatTimeout) + + // Wait for a leader + leader, err = WaitForAny(Leader, envs) + NoErr(err, t) + + allEnvs := append([]*RaftEnv{env1}, envs...) + CheckConsistent(allEnvs, t) + + if len(env1.fsm.logs) != 300 { + t.Fatalf("should apply 300 logs! %d", len(env1.fsm.logs)) + } + + for _, e := range envs { + e.Release() + } +} diff --git a/go/vt/orchestrator/external/raft/log.go b/go/vt/orchestrator/external/raft/log.go new file mode 100644 index 0000000000..9399154ab4 --- /dev/null +++ b/go/vt/orchestrator/external/raft/log.go @@ -0,0 +1,67 @@ +package raft + +// LogType describes various types of log entries. +type LogType uint8 + +const ( + // LogCommand is applied to a user FSM. + LogCommand LogType = iota + + // LogNoop is used to assert leadership. + LogNoop + + // LogAddPeer is used to add a new peer. + LogAddPeer + + // LogRemovePeer is used to remove an existing peer. + LogRemovePeer + + // LogBarrier is used to ensure all preceding operations have been + // applied to the FSM. It is similar to LogNoop, but instead of returning + // once committed, it only returns once the FSM manager acks it. Otherwise + // it is possible there are operations committed but not yet applied to + // the FSM. 
+ LogBarrier +) + +// Log entries are replicated to all members of the Raft cluster +// and form the heart of the replicated state machine. +type Log struct { + // Index holds the index of the log entry. + Index uint64 + + // Term holds the election term of the log entry. + Term uint64 + + // Type holds the type of the log entry. + Type LogType + + // Data holds the log entry's type-specific data. + Data []byte + + // peer is not exported since it is not transmitted, only used + // internally to construct the Data field. + peer string +} + +// LogStore is used to provide an interface for storing +// and retrieving logs in a durable fashion. +type LogStore interface { + // FirstIndex returns the first index written. 0 for no entries. + FirstIndex() (uint64, error) + + // LastIndex returns the last index written. 0 for no entries. + LastIndex() (uint64, error) + + // GetLog gets a log entry at a given index. + GetLog(index uint64, log *Log) error + + // StoreLog stores a log entry. + StoreLog(log *Log) error + + // StoreLogs stores multiple log entries. + StoreLogs(logs []*Log) error + + // DeleteRange deletes a range of log entries. The range is inclusive. + DeleteRange(min, max uint64) error +} diff --git a/go/vt/orchestrator/external/raft/log_cache.go b/go/vt/orchestrator/external/raft/log_cache.go new file mode 100644 index 0000000000..952e98c228 --- /dev/null +++ b/go/vt/orchestrator/external/raft/log_cache.go @@ -0,0 +1,79 @@ +package raft + +import ( + "fmt" + "sync" +) + +// LogCache wraps any LogStore implementation to provide an +// in-memory ring buffer. This is used to cache access to +// the recently written entries. For implementations that do not +// cache themselves, this can provide a substantial boost by +// avoiding disk I/O on recent entries. +type LogCache struct { + store LogStore + + cache []*Log + l sync.RWMutex +} + +// NewLogCache is used to create a new LogCache with the +// given capacity and backend store. 
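For orientation, a minimal sketch of the LogStore contract defined above, using the in-memory NewInmemStore helper that the log-cache test later in this diff also relies on:

```go
package raft

import "fmt"

// logStoreWalkthrough appends a few entries, reads one back, and prunes a
// prefix, exercising the core methods of the LogStore interface.
func logStoreWalkthrough() error {
	var store LogStore = NewInmemStore()

	// Append three command entries.
	for i := uint64(1); i <= 3; i++ {
		entry := &Log{Index: i, Term: 1, Type: LogCommand, Data: []byte("op")}
		if err := store.StoreLog(entry); err != nil {
			return err
		}
	}

	// Read a single entry back by index.
	var out Log
	if err := store.GetLog(2, &out); err != nil {
		return err
	}

	first, _ := store.FirstIndex()
	last, _ := store.LastIndex()
	fmt.Println(first, last, out.Term) // 1 3 1

	// DeleteRange is inclusive on both ends.
	return store.DeleteRange(1, 1)
}
```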
+func NewLogCache(capacity int, store LogStore) (*LogCache, error) { + if capacity <= 0 { + return nil, fmt.Errorf("capacity must be positive") + } + c := &LogCache{ + store: store, + cache: make([]*Log, capacity), + } + return c, nil +} + +func (c *LogCache) GetLog(idx uint64, log *Log) error { + // Check the buffer for an entry + c.l.RLock() + cached := c.cache[idx%uint64(len(c.cache))] + c.l.RUnlock() + + // Check if entry is valid + if cached != nil && cached.Index == idx { + *log = *cached + return nil + } + + // Forward request on cache miss + return c.store.GetLog(idx, log) +} + +func (c *LogCache) StoreLog(log *Log) error { + return c.StoreLogs([]*Log{log}) +} + +func (c *LogCache) StoreLogs(logs []*Log) error { + // Insert the logs into the ring buffer + c.l.Lock() + for _, l := range logs { + c.cache[l.Index%uint64(len(c.cache))] = l + } + c.l.Unlock() + + return c.store.StoreLogs(logs) +} + +func (c *LogCache) FirstIndex() (uint64, error) { + return c.store.FirstIndex() +} + +func (c *LogCache) LastIndex() (uint64, error) { + return c.store.LastIndex() +} + +func (c *LogCache) DeleteRange(min, max uint64) error { + // Invalidate the cache on deletes + c.l.Lock() + c.cache = make([]*Log, len(c.cache)) + c.l.Unlock() + + return c.store.DeleteRange(min, max) +} diff --git a/go/vt/orchestrator/external/raft/log_cache_test.go b/go/vt/orchestrator/external/raft/log_cache_test.go new file mode 100644 index 0000000000..7569e78ee7 --- /dev/null +++ b/go/vt/orchestrator/external/raft/log_cache_test.go @@ -0,0 +1,88 @@ +package raft + +import ( + "testing" +) + +func TestLogCache(t *testing.T) { + store := NewInmemStore() + c, _ := NewLogCache(16, store) + + // Insert into the in-mem store + for i := 0; i < 32; i++ { + log := &Log{Index: uint64(i) + 1} + store.StoreLog(log) + } + + // Check the indexes + if idx, _ := c.FirstIndex(); idx != 1 { + t.Fatalf("bad: %d", idx) + } + if idx, _ := c.LastIndex(); idx != 32 { + t.Fatalf("bad: %d", idx) + } + + // Try get log with a miss + var out Log + err := c.GetLog(1, &out) + if err != nil { + t.Fatalf("err: %v", err) + } + if out.Index != 1 { + t.Fatalf("bad: %#v", out) + } + + // Store logs + l1 := &Log{Index: 33} + l2 := &Log{Index: 34} + err = c.StoreLogs([]*Log{l1, l2}) + if err != nil { + t.Fatalf("err: %v", err) + } + + if idx, _ := c.LastIndex(); idx != 34 { + t.Fatalf("bad: %d", idx) + } + + // Check that it wrote-through + err = store.GetLog(33, &out) + if err != nil { + t.Fatalf("err: %v", err) + } + err = store.GetLog(34, &out) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Delete in the backend + err = store.DeleteRange(33, 34) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Should be in the ring buffer + err = c.GetLog(33, &out) + if err != nil { + t.Fatalf("err: %v", err) + } + err = c.GetLog(34, &out) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Purge the ring buffer + err = c.DeleteRange(33, 34) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Should not be in the ring buffer + err = c.GetLog(33, &out) + if err != ErrLogNotFound { + t.Fatalf("err: %v", err) + } + err = c.GetLog(34, &out) + if err != ErrLogNotFound { + t.Fatalf("err: %v", err) + } +} diff --git a/go/vt/orchestrator/external/raft/net_transport.go b/go/vt/orchestrator/external/raft/net_transport.go new file mode 100644 index 0000000000..3597f9df3f --- /dev/null +++ b/go/vt/orchestrator/external/raft/net_transport.go @@ -0,0 +1,623 @@ +package raft + +import ( + "bufio" + "errors" + "fmt" + "io" + "log" + "net" + "os" + "sync" + 
"time" + + "github.com/hashicorp/go-msgpack/codec" +) + +const ( + rpcAppendEntries uint8 = iota + rpcRequestVote + rpcInstallSnapshot + + // DefaultTimeoutScale is the default TimeoutScale in a NetworkTransport. + DefaultTimeoutScale = 256 * 1024 // 256KB + + // rpcMaxPipeline controls the maximum number of outstanding + // AppendEntries RPC calls. + rpcMaxPipeline = 128 +) + +var ( + // ErrTransportShutdown is returned when operations on a transport are + // invoked after it's been terminated. + ErrTransportShutdown = errors.New("transport shutdown") + + // ErrPipelineShutdown is returned when the pipeline is closed. + ErrPipelineShutdown = errors.New("append pipeline closed") +) + +/* + +NetworkTransport provides a network based transport that can be +used to communicate with Raft on remote machines. It requires +an underlying stream layer to provide a stream abstraction, which can +be simple TCP, TLS, etc. + +This transport is very simple and lightweight. Each RPC request is +framed by sending a byte that indicates the message type, followed +by the MsgPack encoded request. + +The response is an error string followed by the response object, +both are encoded using MsgPack. + +InstallSnapshot is special, in that after the RPC request we stream +the entire state. That socket is not re-used as the connection state +is not known if there is an error. + +*/ +type NetworkTransport struct { + connPool map[string][]*netConn + connPoolLock sync.Mutex + + consumeCh chan RPC + + heartbeatFn func(RPC) + heartbeatFnLock sync.Mutex + + logger *log.Logger + + maxPool int + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex + + stream StreamLayer + + timeout time.Duration + TimeoutScale int +} + +// StreamLayer is used with the NetworkTransport to provide +// the low level stream abstraction. +type StreamLayer interface { + net.Listener + + // Dial is used to create a new outgoing connection + Dial(address string, timeout time.Duration) (net.Conn, error) +} + +type netConn struct { + target string + conn net.Conn + r *bufio.Reader + w *bufio.Writer + dec *codec.Decoder + enc *codec.Encoder +} + +func (n *netConn) Release() error { + return n.conn.Close() +} + +type netPipeline struct { + conn *netConn + trans *NetworkTransport + + doneCh chan AppendFuture + inprogressCh chan *appendFuture + + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex +} + +// NewNetworkTransport creates a new network transport with the given dialer +// and listener. The maxPool controls how many connections we will pool. The +// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply +// the timeout by (SnapshotSize / TimeoutScale). +func NewNetworkTransport( + stream StreamLayer, + maxPool int, + timeout time.Duration, + logOutput io.Writer, +) *NetworkTransport { + if logOutput == nil { + logOutput = os.Stderr + } + return NewNetworkTransportWithLogger(stream, maxPool, timeout, log.New(logOutput, "", log.LstdFlags)) +} + +// NewNetworkTransportWithLogger creates a new network transport with the given dialer +// and listener. The maxPool controls how many connections we will pool. The +// timeout is used to apply I/O deadlines. For InstallSnapshot, we multiply +// the timeout by (SnapshotSize / TimeoutScale). 
+func NewNetworkTransportWithLogger( + stream StreamLayer, + maxPool int, + timeout time.Duration, + logger *log.Logger, +) *NetworkTransport { + if logger == nil { + logger = log.New(os.Stderr, "", log.LstdFlags) + } + trans := &NetworkTransport{ + connPool: make(map[string][]*netConn), + consumeCh: make(chan RPC), + logger: logger, + maxPool: maxPool, + shutdownCh: make(chan struct{}), + stream: stream, + timeout: timeout, + TimeoutScale: DefaultTimeoutScale, + } + go trans.listen() + return trans +} + +// SetHeartbeatHandler is used to setup a heartbeat handler +// as a fast-pass. This is to avoid head-of-line blocking from +// disk IO. +func (n *NetworkTransport) SetHeartbeatHandler(cb func(rpc RPC)) { + n.heartbeatFnLock.Lock() + defer n.heartbeatFnLock.Unlock() + n.heartbeatFn = cb +} + +// Close is used to stop the network transport. +func (n *NetworkTransport) Close() error { + n.shutdownLock.Lock() + defer n.shutdownLock.Unlock() + + if !n.shutdown { + close(n.shutdownCh) + n.stream.Close() + n.shutdown = true + } + return nil +} + +// Consumer implements the Transport interface. +func (n *NetworkTransport) Consumer() <-chan RPC { + return n.consumeCh +} + +// LocalAddr implements the Transport interface. +func (n *NetworkTransport) LocalAddr() string { + return n.stream.Addr().String() +} + +// IsShutdown is used to check if the transport is shutdown. +func (n *NetworkTransport) IsShutdown() bool { + select { + case <-n.shutdownCh: + return true + default: + return false + } +} + +// getExistingConn is used to grab a pooled connection. +func (n *NetworkTransport) getPooledConn(target string) *netConn { + n.connPoolLock.Lock() + defer n.connPoolLock.Unlock() + + conns, ok := n.connPool[target] + if !ok || len(conns) == 0 { + return nil + } + + var conn *netConn + num := len(conns) + conn, conns[num-1] = conns[num-1], nil + n.connPool[target] = conns[:num-1] + return conn +} + +// getConn is used to get a connection from the pool. +func (n *NetworkTransport) getConn(target string) (*netConn, error) { + // Check for a pooled conn + if conn := n.getPooledConn(target); conn != nil { + return conn, nil + } + + // Dial a new connection + conn, err := n.stream.Dial(target, n.timeout) + if err != nil { + return nil, err + } + + // Wrap the conn + netConn := &netConn{ + target: target, + conn: conn, + r: bufio.NewReader(conn), + w: bufio.NewWriter(conn), + } + + // Setup encoder/decoders + netConn.dec = codec.NewDecoder(netConn.r, &codec.MsgpackHandle{}) + netConn.enc = codec.NewEncoder(netConn.w, &codec.MsgpackHandle{}) + + // Done + return netConn, nil +} + +// returnConn returns a connection back to the pool. +func (n *NetworkTransport) returnConn(conn *netConn) { + n.connPoolLock.Lock() + defer n.connPoolLock.Unlock() + + key := conn.target + conns, _ := n.connPool[key] + + if !n.IsShutdown() && len(conns) < n.maxPool { + n.connPool[key] = append(conns, conn) + } else { + conn.Release() + } +} + +// AppendEntriesPipeline returns an interface that can be used to pipeline +// AppendEntries requests. +func (n *NetworkTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) { + // Get a connection + conn, err := n.getConn(target) + if err != nil { + return nil, err + } + + // Create the pipeline + return newNetPipeline(n, conn), nil +} + +// AppendEntries implements the Transport interface. 
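A usage sketch for the consumer side of the transport shown above. This is only the consumption pattern (the same one the transport tests in this diff use on the receiving end), not what Raft's own run loop does; the response values here are placeholders.

```go
package raft

import "fmt"

// serveRPCs drains the transport's consumer channel and answers each RPC
// with a canned response until shutdown is signalled.
func serveRPCs(trans *NetworkTransport, shutdownCh <-chan struct{}) {
	for {
		select {
		case rpc := <-trans.Consumer():
			switch cmd := rpc.Command.(type) {
			case *AppendEntriesRequest:
				// A real node would verify terms and append entries here.
				rpc.Respond(&AppendEntriesResponse{Term: cmd.Term, Success: true}, nil)
			case *RequestVoteRequest:
				rpc.Respond(&RequestVoteResponse{Term: cmd.Term, Granted: false}, nil)
			default:
				rpc.Respond(nil, fmt.Errorf("unhandled command type %T", cmd))
			}
		case <-shutdownCh:
			return
		}
	}
}
```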
+func (n *NetworkTransport) AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error { + return n.genericRPC(target, rpcAppendEntries, args, resp) +} + +// RequestVote implements the Transport interface. +func (n *NetworkTransport) RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error { + return n.genericRPC(target, rpcRequestVote, args, resp) +} + +// genericRPC handles a simple request/response RPC. +func (n *NetworkTransport) genericRPC(target string, rpcType uint8, args interface{}, resp interface{}) error { + // Get a conn + conn, err := n.getConn(target) + if err != nil { + return err + } + + // Set a deadline + if n.timeout > 0 { + conn.conn.SetDeadline(time.Now().Add(n.timeout)) + } + + // Send the RPC + if err = sendRPC(conn, rpcType, args); err != nil { + return err + } + + // Decode the response + canReturn, err := decodeResponse(conn, resp) + if canReturn { + n.returnConn(conn) + } + return err +} + +// InstallSnapshot implements the Transport interface. +func (n *NetworkTransport) InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error { + // Get a conn, always close for InstallSnapshot + conn, err := n.getConn(target) + if err != nil { + return err + } + defer conn.Release() + + // Set a deadline, scaled by request size + if n.timeout > 0 { + timeout := n.timeout * time.Duration(args.Size/int64(n.TimeoutScale)) + if timeout < n.timeout { + timeout = n.timeout + } + conn.conn.SetDeadline(time.Now().Add(timeout)) + } + + // Send the RPC + if err = sendRPC(conn, rpcInstallSnapshot, args); err != nil { + return err + } + + // Stream the state + if _, err = io.Copy(conn.w, data); err != nil { + return err + } + + // Flush + if err = conn.w.Flush(); err != nil { + return err + } + + // Decode the response, do not return conn + _, err = decodeResponse(conn, resp) + + return err +} + +// EncodePeer implements the Transport interface. +func (n *NetworkTransport) EncodePeer(p string) []byte { + return []byte(p) +} + +// DecodePeer implements the Transport interface. +func (n *NetworkTransport) DecodePeer(buf []byte) string { + return string(buf) +} + +// listen is used to handling incoming connections. +func (n *NetworkTransport) listen() { + for { + // Accept incoming connections + conn, err := n.stream.Accept() + if err != nil { + if n.IsShutdown() { + return + } + n.logger.Printf("[ERR] raft-net: Failed to accept connection: %v", err) + continue + } + n.logger.Printf("[DEBUG] raft-net: %v accepted connection from: %v", n.LocalAddr(), conn.RemoteAddr()) + + // Handle the connection in dedicated routine + go n.handleConn(conn) + } +} + +// handleConn is used to handle an inbound connection for its lifespan. +func (n *NetworkTransport) handleConn(conn net.Conn) { + defer conn.Close() + r := bufio.NewReader(conn) + w := bufio.NewWriter(conn) + dec := codec.NewDecoder(r, &codec.MsgpackHandle{}) + enc := codec.NewEncoder(w, &codec.MsgpackHandle{}) + + for { + if err := n.handleCommand(r, dec, enc); err != nil { + if err != io.EOF { + n.logger.Printf("[ERR] raft-net: Failed to decode incoming command: %v", err) + } + return + } + if err := w.Flush(); err != nil { + n.logger.Printf("[ERR] raft-net: Failed to flush response: %v", err) + return + } + } +} + +// handleCommand is used to decode and dispatch a single command. 
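The deadline scaling in InstallSnapshot above is worth a worked example; this sketch repeats the same arithmetic with an assumed 10s base timeout, a 10MB snapshot, and the 256KB DefaultTimeoutScale.

```go
package raft

import (
	"fmt"
	"time"
)

// snapshotDeadline mirrors the scaling used by InstallSnapshot: the base
// timeout is multiplied by (size / TimeoutScale), but never shrinks below
// the base timeout itself.
func snapshotDeadline(base time.Duration, size int64, scale int) time.Duration {
	timeout := base * time.Duration(size/int64(scale))
	if timeout < base {
		timeout = base
	}
	return timeout
}

func exampleSnapshotDeadline() {
	// 10MB snapshot with a 10s timeout and the 256KB DefaultTimeoutScale:
	// 10*1024*1024 / (256*1024) = 40, so the deadline becomes 400s.
	fmt.Println(snapshotDeadline(10*time.Second, 10*1024*1024, DefaultTimeoutScale))
}
```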
+func (n *NetworkTransport) handleCommand(r *bufio.Reader, dec *codec.Decoder, enc *codec.Encoder) error { + // Get the rpc type + rpcType, err := r.ReadByte() + if err != nil { + return err + } + + // Create the RPC object + respCh := make(chan RPCResponse, 1) + rpc := RPC{ + RespChan: respCh, + } + + // Decode the command + isHeartbeat := false + switch rpcType { + case rpcAppendEntries: + var req AppendEntriesRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + + // Check if this is a heartbeat + if req.Term != 0 && req.Leader != nil && + req.PrevLogEntry == 0 && req.PrevLogTerm == 0 && + len(req.Entries) == 0 && req.LeaderCommitIndex == 0 { + isHeartbeat = true + } + + case rpcRequestVote: + var req RequestVoteRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + + case rpcInstallSnapshot: + var req InstallSnapshotRequest + if err := dec.Decode(&req); err != nil { + return err + } + rpc.Command = &req + rpc.Reader = io.LimitReader(r, req.Size) + + default: + return fmt.Errorf("unknown rpc type %d", rpcType) + } + + // Check for heartbeat fast-path + if isHeartbeat { + n.heartbeatFnLock.Lock() + fn := n.heartbeatFn + n.heartbeatFnLock.Unlock() + if fn != nil { + fn(rpc) + goto RESP + } + } + + // Dispatch the RPC + select { + case n.consumeCh <- rpc: + case <-n.shutdownCh: + return ErrTransportShutdown + } + + // Wait for response +RESP: + select { + case resp := <-respCh: + // Send the error first + respErr := "" + if resp.Error != nil { + respErr = resp.Error.Error() + } + if err := enc.Encode(respErr); err != nil { + return err + } + + // Send the response + if err := enc.Encode(resp.Response); err != nil { + return err + } + case <-n.shutdownCh: + return ErrTransportShutdown + } + return nil +} + +// decodeResponse is used to decode an RPC response and reports whether +// the connection can be reused. +func decodeResponse(conn *netConn, resp interface{}) (bool, error) { + // Decode the error if any + var rpcError string + if err := conn.dec.Decode(&rpcError); err != nil { + conn.Release() + return false, err + } + + // Decode the response + if err := conn.dec.Decode(resp); err != nil { + conn.Release() + return false, err + } + + // Format an error if any + if rpcError != "" { + return true, fmt.Errorf(rpcError) + } + return true, nil +} + +// sendRPC is used to encode and send the RPC. +func sendRPC(conn *netConn, rpcType uint8, args interface{}) error { + // Write the request type + if err := conn.w.WriteByte(rpcType); err != nil { + conn.Release() + return err + } + + // Send the request + if err := conn.enc.Encode(args); err != nil { + conn.Release() + return err + } + + // Flush + if err := conn.w.Flush(); err != nil { + conn.Release() + return err + } + return nil +} + +// newNetPipeline is used to construct a netPipeline from a given +// transport and connection. +func newNetPipeline(trans *NetworkTransport, conn *netConn) *netPipeline { + n := &netPipeline{ + conn: conn, + trans: trans, + doneCh: make(chan AppendFuture, rpcMaxPipeline), + inprogressCh: make(chan *appendFuture, rpcMaxPipeline), + shutdownCh: make(chan struct{}), + } + go n.decodeResponses() + return n +} + +// decodeResponses is a long running routine that decodes the responses +// sent on the connection. 
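For clarity on the heartbeat check in handleCommand above, a sketch of the minimal request that satisfies every clause of that predicate: a non-zero Term, a non-nil Leader, and zero values everywhere else.

```go
package raft

// heartbeatRequest builds the minimal AppendEntriesRequest that the
// fast-path check in handleCommand classifies as a heartbeat.
func heartbeatRequest(term uint64, leader []byte) *AppendEntriesRequest {
	return &AppendEntriesRequest{
		Term:   term,   // must be non-zero
		Leader: leader, // must be non-nil
		// PrevLogEntry, PrevLogTerm, Entries and LeaderCommitIndex are all
		// left at their zero values, which is exactly what the check requires.
	}
}
```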
+func (n *netPipeline) decodeResponses() { + timeout := n.trans.timeout + for { + select { + case future := <-n.inprogressCh: + if timeout > 0 { + n.conn.conn.SetReadDeadline(time.Now().Add(timeout)) + } + + _, err := decodeResponse(n.conn, future.resp) + future.respond(err) + select { + case n.doneCh <- future: + case <-n.shutdownCh: + return + } + case <-n.shutdownCh: + return + } + } +} + +// AppendEntries is used to pipeline a new append entries request. +func (n *netPipeline) AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) { + // Create a new future + future := &appendFuture{ + start: time.Now(), + args: args, + resp: resp, + } + future.init() + + // Add a send timeout + if timeout := n.trans.timeout; timeout > 0 { + n.conn.conn.SetWriteDeadline(time.Now().Add(timeout)) + } + + // Send the RPC + if err := sendRPC(n.conn, rpcAppendEntries, future.args); err != nil { + return nil, err + } + + // Hand-off for decoding, this can also cause back-pressure + // to prevent too many inflight requests + select { + case n.inprogressCh <- future: + return future, nil + case <-n.shutdownCh: + return nil, ErrPipelineShutdown + } +} + +// Consumer returns a channel that can be used to consume complete futures. +func (n *netPipeline) Consumer() <-chan AppendFuture { + return n.doneCh +} + +// Closed is used to shutdown the pipeline connection. +func (n *netPipeline) Close() error { + n.shutdownLock.Lock() + defer n.shutdownLock.Unlock() + if n.shutdown { + return nil + } + + // Release the connection + n.conn.Release() + + n.shutdown = true + close(n.shutdownCh) + return nil +} diff --git a/go/vt/orchestrator/external/raft/net_transport_test.go b/go/vt/orchestrator/external/raft/net_transport_test.go new file mode 100644 index 0000000000..88b04c2d84 --- /dev/null +++ b/go/vt/orchestrator/external/raft/net_transport_test.go @@ -0,0 +1,449 @@ +package raft + +import ( + "bytes" + "reflect" + "sync" + "testing" + "time" +) + +func TestNetworkTransport_StartStop(t *testing.T) { + trans, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + trans.Close() +} + +func TestNetworkTransport_Heartbeat_FastPath(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + invoked := false + fastpath := func(rpc RPC) { + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + rpc.Respond(&resp, nil) + invoked = true + } + trans1.SetHeartbeatHandler(fastpath) + + // Transport 2 makes outbound request + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + var out AppendEntriesResponse + if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } + + // Ensure fast-path is used + if !invoked { + t.Fatalf("fast-path not used") + } +} + +func 
TestNetworkTransport_AppendEntries(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + PrevLogEntry: 100, + PrevLogTerm: 4, + Entries: []*Log{ + { + Index: 101, + Term: 4, + Type: LogNoop, + }, + }, + LeaderCommitIndex: 90, + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + var out AppendEntriesResponse + if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } +} + +func TestNetworkTransport_AppendEntriesPipeline(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + PrevLogEntry: 100, + PrevLogTerm: 4, + Entries: []*Log{ + { + Index: 101, + Term: 4, + Type: LogNoop, + }, + }, + LeaderCommitIndex: 90, + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + // Listen for a request + go func() { + for i := 0; i < 10; i++ { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + } + }() + + // Transport 2 makes outbound request + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + pipeline, err := trans2.AppendEntriesPipeline(trans1.LocalAddr()) + if err != nil { + t.Fatalf("err: %v", err) + } + defer pipeline.Close() + for i := 0; i < 10; i++ { + out := new(AppendEntriesResponse) + if _, err := pipeline.AppendEntries(&args, out); err != nil { + t.Fatalf("err: %v", err) + } + } + + respCh := pipeline.Consumer() + for i := 0; i < 10; i++ { + select { + case ready := <-respCh: + // Verify the response + if !reflect.DeepEqual(&resp, ready.Response()) { + t.Fatalf("command mismatch: %#v %#v", &resp, ready.Response()) + } + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + } +} + +func TestNetworkTransport_RequestVote(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + rpcCh := trans1.Consumer() + + // 
Make the RPC request + args := RequestVoteRequest{ + Term: 20, + Candidate: []byte("butters"), + LastLogIndex: 100, + LastLogTerm: 19, + } + resp := RequestVoteResponse{ + Term: 100, + Peers: []byte("blah"), + Granted: false, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*RequestVoteRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + var out RequestVoteResponse + if err := trans2.RequestVote(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } +} + +func TestNetworkTransport_InstallSnapshot(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := InstallSnapshotRequest{ + Term: 10, + Leader: []byte("kyle"), + LastLogIndex: 100, + LastLogTerm: 9, + Peers: []byte("blah blah"), + Size: 10, + } + resp := InstallSnapshotResponse{ + Term: 10, + Success: true, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*InstallSnapshotRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + // Try to read the bytes + buf := make([]byte, 10) + rpc.Reader.Read(buf) + + // Compare + if bytes.Compare(buf, []byte("0123456789")) != 0 { + t.Fatalf("bad buf %v", buf) + } + + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + // Create a buffer + buf := bytes.NewBuffer([]byte("0123456789")) + + var out InstallSnapshotResponse + if err := trans2.InstallSnapshot(trans1.LocalAddr(), &args, &out, buf); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } +} + +func TestNetworkTransport_EncodeDecode(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + + local := trans1.LocalAddr() + enc := trans1.EncodePeer(local) + dec := trans1.DecodePeer(enc) + + if dec != local { + t.Fatalf("enc/dec fail: %v %v", dec, local) + } +} + +func TestNetworkTransport_PooledConn(t *testing.T) { + // Transport 1 is consumer + trans1, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 2, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + PrevLogEntry: 100, + PrevLogTerm: 4, + Entries: []*Log{ + { + 
Index: 101, + Term: 4, + Type: LogNoop, + }, + }, + LeaderCommitIndex: 90, + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + // Listen for a request + go func() { + for { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + return + } + } + }() + + // Transport 2 makes outbound request, 3 conn pool + trans2, err := NewTCPTransportWithLogger("127.0.0.1:0", nil, 3, time.Second, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + defer trans2.Close() + + // Create wait group + wg := &sync.WaitGroup{} + wg.Add(5) + + appendFunc := func() { + defer wg.Done() + var out AppendEntriesResponse + if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } + } + + // Try to do parallel appends, should stress the conn pool + for i := 0; i < 5; i++ { + go appendFunc() + } + + // Wait for the routines to finish + wg.Wait() + + // Check the conn pool size + addr := trans1.LocalAddr() + if len(trans2.connPool[addr]) != 3 { + t.Fatalf("Expected 2 pooled conns!") + } +} diff --git a/go/vt/orchestrator/external/raft/observer.go b/go/vt/orchestrator/external/raft/observer.go new file mode 100644 index 0000000000..d41f765a2f --- /dev/null +++ b/go/vt/orchestrator/external/raft/observer.go @@ -0,0 +1,120 @@ +package raft + +import ( + "sync/atomic" +) + +// Observation is sent along the given channel to observers when an event occurs. +type Observation struct { + // Raft holds the Raft instance generating the observation. + Raft *Raft + // Data holds observation-specific data. Possible types are + // *RequestVoteRequest, RaftState and LeaderObservation. + Data interface{} +} + +// LeaderObservation is used in Observation.Data when leadership changes. +type LeaderObservation struct { + Leader string +} + +// nextObserverId is used to provide a unique ID for each observer to aid in +// deregistration. +var nextObserverID uint64 + +// FilterFn is a function that can be registered in order to filter observations. +// The function reports whether the observation should be included - if +// it returns false, the observation will be filtered out. +type FilterFn func(o *Observation) bool + +// Observer describes what to do with a given observation. +type Observer struct { + // channel receives observations. + channel chan Observation + + // blocking, if true, will cause Raft to block when sending an observation + // to this observer. This should generally be set to false. + blocking bool + + // filter will be called to determine if an observation should be sent to + // the channel. + filter FilterFn + + // id is the ID of this observer in the Raft map. + id uint64 + + // numObserved and numDropped are performance counters for this observer. + numObserved uint64 + numDropped uint64 +} + +// NewObserver creates a new observer that can be registered +// to make observations on a Raft instance. Observations +// will be sent on the given channel if they satisfy the +// given filter. +// +// If blocking is true, the observer will block when it can't +// send on the channel, otherwise it may discard events. 
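A hedged usage sketch for the observer machinery above: it registers a non-blocking observer whose filter keeps only leadership changes, using the RegisterObserver method defined just below.

```go
package raft

import "log"

// watchLeadership registers a non-blocking observer that only receives
// LeaderObservation events and logs them.
func watchLeadership(r *Raft) *Observer {
	ch := make(chan Observation, 16)
	filter := func(o *Observation) bool {
		_, isLeaderChange := o.Data.(LeaderObservation)
		return isLeaderChange
	}
	obs := NewObserver(ch, false, filter) // false: drop events rather than block Raft
	r.RegisterObserver(obs)

	go func() {
		for o := range ch {
			lo := o.Data.(LeaderObservation)
			log.Printf("leader changed to %q", lo.Leader)
		}
	}()
	return obs
}
```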
+func NewObserver(channel chan Observation, blocking bool, filter FilterFn) *Observer { + return &Observer{ + channel: channel, + blocking: blocking, + filter: filter, + id: atomic.AddUint64(&nextObserverID, 1), + } +} + +// GetNumObserved returns the number of observations. +func (or *Observer) GetNumObserved() uint64 { + return atomic.LoadUint64(&or.numObserved) +} + +// GetNumDropped returns the number of dropped observations due to blocking. +func (or *Observer) GetNumDropped() uint64 { + return atomic.LoadUint64(&or.numDropped) +} + +// RegisterObserver registers a new observer. +func (r *Raft) RegisterObserver(or *Observer) { + r.observersLock.Lock() + defer r.observersLock.Unlock() + r.observers[or.id] = or +} + +// DeregisterObserver deregisters an observer. +func (r *Raft) DeregisterObserver(or *Observer) { + r.observersLock.Lock() + defer r.observersLock.Unlock() + delete(r.observers, or.id) +} + +// observe sends an observation to every observer. +func (r *Raft) observe(o interface{}) { + // In general observers should not block. But in any case this isn't + // disastrous as we only hold a read lock, which merely prevents + // registration / deregistration of observers. + r.observersLock.RLock() + defer r.observersLock.RUnlock() + for _, or := range r.observers { + // It's wasteful to do this in the loop, but for the common case + // where there are no observers we won't create any objects. + ob := Observation{Raft: r, Data: o} + if or.filter != nil && !or.filter(&ob) { + continue + } + if or.channel == nil { + continue + } + if or.blocking { + or.channel <- ob + atomic.AddUint64(&or.numObserved, 1) + } else { + select { + case or.channel <- ob: + atomic.AddUint64(&or.numObserved, 1) + default: + atomic.AddUint64(&or.numDropped, 1) + } + } + } +} diff --git a/go/vt/orchestrator/external/raft/peer.go b/go/vt/orchestrator/external/raft/peer.go new file mode 100644 index 0000000000..6f3bcf8564 --- /dev/null +++ b/go/vt/orchestrator/external/raft/peer.go @@ -0,0 +1,122 @@ +package raft + +import ( + "bytes" + "encoding/json" + "io/ioutil" + "os" + "path/filepath" + "sync" +) + +const ( + jsonPeerPath = "peers.json" +) + +// PeerStore provides an interface for persistent storage and +// retrieval of peers. We use a separate interface than StableStore +// since the peers may need to be edited by a human operator. For example, +// in a two node cluster, the failure of either node requires human intervention +// since consensus is impossible. +type PeerStore interface { + // Peers returns the list of known peers. + Peers() ([]string, error) + + // SetPeers sets the list of known peers. This is invoked when a peer is + // added or removed. + SetPeers([]string) error +} + +// StaticPeers is used to provide a static list of peers. +type StaticPeers struct { + StaticPeers []string + l sync.Mutex +} + +// Peers implements the PeerStore interface. +func (s *StaticPeers) Peers() ([]string, error) { + s.l.Lock() + peers := s.StaticPeers + s.l.Unlock() + return peers, nil +} + +// SetPeers implements the PeerStore interface. +func (s *StaticPeers) SetPeers(p []string) error { + s.l.Lock() + s.StaticPeers = p + s.l.Unlock() + return nil +} + +// JSONPeers is used to provide peer persistence on disk in the form +// of a JSON file. This allows human operators to manipulate the file. +type JSONPeers struct { + l sync.Mutex + path string + trans Transport +} + +// NewJSONPeers creates a new JSONPeers store. Requires a transport +// to handle the serialization of network addresses. 
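To make JSONPeers concrete: peers.json holds a plain JSON array of transport-encoded addresses, which is what allows a human operator to seed or repair it by hand. A small round-trip sketch, using the in-memory transport that the peer test below also uses for address encoding.

```go
package raft

import (
	"fmt"
	"io/ioutil"
	"os"
)

// jsonPeersRoundTrip writes a peer set through JSONPeers and reads it back.
func jsonPeersRoundTrip() error {
	dir, err := ioutil.TempDir("", "raft-peers")
	if err != nil {
		return err
	}
	defer os.RemoveAll(dir)

	_, trans := NewInmemTransport("") // transport supplies EncodePeer/DecodePeer
	store := NewJSONPeers(dir, trans)

	// Persists to <dir>/peers.json as a JSON array of address strings.
	if err := store.SetPeers([]string{NewInmemAddr(), NewInmemAddr()}); err != nil {
		return err
	}

	peers, err := store.Peers()
	if err != nil {
		return err
	}
	fmt.Println(len(peers)) // 2
	return nil
}
```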
+func NewJSONPeers(base string, trans Transport) *JSONPeers { + path := filepath.Join(base, jsonPeerPath) + store := &JSONPeers{ + path: path, + trans: trans, + } + return store +} + +// Peers implements the PeerStore interface. +func (j *JSONPeers) Peers() ([]string, error) { + j.l.Lock() + defer j.l.Unlock() + + // Read the file + buf, err := ioutil.ReadFile(j.path) + if err != nil && !os.IsNotExist(err) { + return nil, err + } + + // Check for no peers + if len(buf) == 0 { + return nil, nil + } + + // Decode the peers + var peerSet []string + dec := json.NewDecoder(bytes.NewReader(buf)) + if err := dec.Decode(&peerSet); err != nil { + return nil, err + } + + // Deserialize each peer + var peers []string + for _, p := range peerSet { + peers = append(peers, j.trans.DecodePeer([]byte(p))) + } + return peers, nil +} + +// SetPeers implements the PeerStore interface. +func (j *JSONPeers) SetPeers(peers []string) error { + j.l.Lock() + defer j.l.Unlock() + + // Encode each peer + var peerSet []string + for _, p := range peers { + peerSet = append(peerSet, string(j.trans.EncodePeer(p))) + } + + // Convert to JSON + var buf bytes.Buffer + enc := json.NewEncoder(&buf) + if err := enc.Encode(peerSet); err != nil { + return err + } + + // Write out as JSON + return ioutil.WriteFile(j.path, buf.Bytes(), 0755) +} diff --git a/go/vt/orchestrator/external/raft/peer_test.go b/go/vt/orchestrator/external/raft/peer_test.go new file mode 100644 index 0000000000..ff835e026a --- /dev/null +++ b/go/vt/orchestrator/external/raft/peer_test.go @@ -0,0 +1,44 @@ +package raft + +import ( + "io/ioutil" + "os" + "testing" +) + +func TestJSONPeers(t *testing.T) { + // Create a test dir + dir, err := ioutil.TempDir("", "raft") + if err != nil { + t.Fatalf("err: %v ", err) + } + defer os.RemoveAll(dir) + + // Create the store + _, trans := NewInmemTransport("") + store := NewJSONPeers(dir, trans) + + // Try a read, should get nothing + peers, err := store.Peers() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(peers) != 0 { + t.Fatalf("peers: %v", peers) + } + + // Initialize some peers + newPeers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()} + if err := store.SetPeers(newPeers); err != nil { + t.Fatalf("err: %v", err) + } + + // Try a read, should peers + peers, err = store.Peers() + if err != nil { + t.Fatalf("err: %v", err) + } + if len(peers) != 3 { + t.Fatalf("peers: %v", peers) + } +} diff --git a/go/vt/orchestrator/external/raft/raft.go b/go/vt/orchestrator/external/raft/raft.go new file mode 100644 index 0000000000..bd630a495d --- /dev/null +++ b/go/vt/orchestrator/external/raft/raft.go @@ -0,0 +1,1958 @@ +package raft + +import ( + "bytes" + "errors" + "fmt" + "io" + "io/ioutil" + "log" + "os" + "strconv" + "sync" + "sync/atomic" + "time" + + "github.com/armon/go-metrics" +) + +const ( + minCheckInterval = 10 * time.Millisecond +) + +var ( + keyCurrentTerm = []byte("CurrentTerm") + keyLastVoteTerm = []byte("LastVoteTerm") + keyLastVoteCand = []byte("LastVoteCand") + + // ErrLeader is returned when an operation can't be completed on a + // leader node. + ErrLeader = errors.New("node is the leader") + + // ErrNotLeader is returned when an operation can't be completed on a + // follower or candidate node. + ErrNotLeader = errors.New("node is not the leader") + + // ErrLeadershipLost is returned when a leader fails to commit a log entry + // because it's been deposed in the process. 
+ ErrLeadershipLost = errors.New("leadership lost while committing log") + + // ErrRaftShutdown is returned when operations are requested against an + // inactive Raft. + ErrRaftShutdown = errors.New("raft is already shutdown") + + // ErrEnqueueTimeout is returned when a command fails due to a timeout. + ErrEnqueueTimeout = errors.New("timed out enqueuing operation") + + // ErrKnownPeer is returned when trying to add a peer to the configuration + // that already exists. + ErrKnownPeer = errors.New("peer already known") + + // ErrUnknownPeer is returned when trying to remove a peer from the + // configuration that doesn't exist. + ErrUnknownPeer = errors.New("peer is unknown") + + // ErrNothingNewToSnapshot is returned when trying to create a snapshot + // but there's nothing new commited to the FSM since we started. + ErrNothingNewToSnapshot = errors.New("Nothing new to snapshot") +) + +// commitTuple is used to send an index that was committed, +// with an optional associated future that should be invoked. +type commitTuple struct { + log *Log + future *logFuture +} + +// leaderState is state that is used while we are a leader. +type leaderState struct { + commitCh chan struct{} + inflight *inflight + replState map[string]*followerReplication + notify map[*verifyFuture]struct{} + stepDown chan struct{} +} + +// Raft implements a Raft node. +type Raft struct { + raftState + + // applyCh is used to async send logs to the main thread to + // be committed and applied to the FSM. + applyCh chan *logFuture + + // Configuration provided at Raft initialization + conf *Config + + // FSM is the client state machine to apply commands to + fsm FSM + + // fsmCommitCh is used to trigger async application of logs to the fsm + fsmCommitCh chan commitTuple + + // fsmRestoreCh is used to trigger a restore from snapshot + fsmRestoreCh chan *restoreFuture + + // fsmSnapshotCh is used to trigger a new snapshot being taken + fsmSnapshotCh chan *reqSnapshotFuture + + // lastContact is the last time we had contact from the + // leader node. This can be used to gauge staleness. + lastContact time.Time + lastContactLock sync.RWMutex + + // Leader is the current cluster leader + leader string + leaderLock sync.RWMutex + + // leaderCh is used to notify of leadership changes + leaderCh chan bool + + // leaderState used only while state is leader + leaderState leaderState + + // Stores our local addr + localAddr string + + // Used for our logging + logger *log.Logger + + // LogStore provides durable storage for logs + logs LogStore + + // Track our known peers + peerCh chan *peerFuture + peers []string + peerStore PeerStore + + // RPC chan comes from the transport layer + rpcCh <-chan RPC + + // Shutdown channel to exit, protected to prevent concurrent exits + shutdown bool + shutdownCh chan struct{} + shutdownLock sync.Mutex + + // snapshots is used to store and retrieve snapshots + snapshots SnapshotStore + + // snapshotCh is used for user triggered snapshots + snapshotCh chan *snapshotFuture + + // stable is a StableStore implementation for durable state + // It provides stable storage for many fields in raftState + stable StableStore + + // The transport layer we use + trans Transport + + // verifyCh is used to async send verify futures to the main thread + // to verify we are still the leader + verifyCh chan *verifyFuture + + // List of observers and the mutex that protects them. The observers list + // is indexed by an artificial ID which is used for deregistration. 
+ observersLock sync.RWMutex + observers map[uint64]*Observer + + // suspendLeadership is a hint for Raft to not become a leader. This flag is bound by time, and can be used + // to control the identity of the leader in a (stable) group + suspendLeadership int64 +} + +// NewRaft is used to construct a new Raft node. It takes a configuration, as well +// as implementations of various interfaces that are required. If we have any old state, +// such as snapshots, logs, peers, etc, all those will be restored when creating the +// Raft node. +func NewRaft(conf *Config, fsm FSM, logs LogStore, stable StableStore, snaps SnapshotStore, + peerStore PeerStore, trans Transport) (*Raft, error) { + // Validate the configuration + if err := ValidateConfig(conf); err != nil { + return nil, err + } + + // Ensure we have a LogOutput + var logger *log.Logger + if conf.Logger != nil { + logger = conf.Logger + } else { + if conf.LogOutput == nil { + conf.LogOutput = os.Stderr + } + logger = log.New(conf.LogOutput, "", log.LstdFlags) + } + + // Try to restore the current term + currentTerm, err := stable.GetUint64(keyCurrentTerm) + if err != nil && err.Error() != "not found" { + return nil, fmt.Errorf("failed to load current term: %v", err) + } + + // Read the last log value + lastIdx, err := logs.LastIndex() + if err != nil { + return nil, fmt.Errorf("failed to find last log: %v", err) + } + + // Get the log + var lastLog Log + if lastIdx > 0 { + if err = logs.GetLog(lastIdx, &lastLog); err != nil { + return nil, fmt.Errorf("failed to get last log: %v", err) + } + } + + // Construct the list of peers that excludes us + localAddr := trans.LocalAddr() + peers, err := peerStore.Peers() + if err != nil { + return nil, fmt.Errorf("failed to get list of peers: %v", err) + } + peers = ExcludePeer(peers, localAddr) + + // Create Raft struct + r := &Raft{ + applyCh: make(chan *logFuture), + conf: conf, + fsm: fsm, + fsmCommitCh: make(chan commitTuple, 128), + fsmRestoreCh: make(chan *restoreFuture), + fsmSnapshotCh: make(chan *reqSnapshotFuture), + leaderCh: make(chan bool), + localAddr: localAddr, + logger: logger, + logs: logs, + peerCh: make(chan *peerFuture), + peers: peers, + peerStore: peerStore, + rpcCh: trans.Consumer(), + snapshots: snaps, + snapshotCh: make(chan *snapshotFuture), + shutdownCh: make(chan struct{}), + stable: stable, + trans: trans, + verifyCh: make(chan *verifyFuture, 64), + observers: make(map[uint64]*Observer), + } + + // Initialize as a follower + r.setState(Follower) + + // Start as leader if specified. This should only be used + // for testing purposes. + if conf.StartAsLeader { + r.setState(Leader) + r.setLeader(r.localAddr) + } + + // Restore the current term and the last log + r.setCurrentTerm(currentTerm) + r.setLastLog(lastLog.Index, lastLog.Term) + + // Attempt to restore a snapshot if there are any + if err := r.restoreSnapshot(); err != nil { + return nil, err + } + + // Setup a heartbeat fast-path to avoid head-of-line + // blocking where possible. It MUST be safe for this + // to be called concurrently with a blocking RPC. + trans.SetHeartbeatHandler(r.processHeartbeat) + + // Start the background work + r.goFunc(r.run) + r.goFunc(r.runFSM) + r.goFunc(r.runSnapshots) + return r, nil +} + +// Leader is used to return the current leader of the cluster. +// It may return empty string if there is no current leader +// or the leader is unknown. 
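A condensed wiring sketch for NewRaft above, following the same shape as MakeRaft in the integration test earlier in this diff: an InmemStore doubling as LogStore and StableStore, a FileSnapshotStore, JSONPeers for membership, and a TCP transport. The fsm argument stands in for whatever the application's FSM implementation is.

```go
package raft

import "time"

// newNode assembles a Raft node from the stores and transport provided by
// this package, mirroring MakeRaft in integ_test.go.
func newNode(dir string, fsm FSM) (*Raft, error) {
	conf := DefaultConfig()

	store := NewInmemStore() // serves as both LogStore and StableStore

	snaps, err := NewFileSnapshotStore(dir, 3, nil)
	if err != nil {
		return nil, err
	}

	trans, err := NewTCPTransport("127.0.0.1:0", nil, 2, time.Second, nil)
	if err != nil {
		return nil, err
	}

	peers := NewJSONPeers(dir, trans)

	return NewRaft(conf, fsm, store, store, snaps, peers, trans)
}
```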
+func (r *Raft) Leader() string { + r.leaderLock.RLock() + leader := r.leader + r.leaderLock.RUnlock() + return leader +} + +// setLeader is used to modify the current leader of the cluster +func (r *Raft) setLeader(leader string) { + r.leaderLock.Lock() + oldLeader := r.leader + r.leader = leader + r.leaderLock.Unlock() + if oldLeader != leader { + r.observe(LeaderObservation{Leader: leader}) + } +} + +// Apply is used to apply a command to the FSM in a highly consistent +// manner. This returns a future that can be used to wait on the application. +// An optional timeout can be provided to limit the amount of time we wait +// for the command to be started. This must be run on the leader or it +// will fail. +func (r *Raft) Apply(cmd []byte, timeout time.Duration) ApplyFuture { + metrics.IncrCounter([]string{"raft", "apply"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Create a log future, no index or term yet + logFuture := &logFuture{ + log: Log{ + Type: LogCommand, + Data: cmd, + }, + } + logFuture.init() + + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.applyCh <- logFuture: + return logFuture + } +} + +// Barrier is used to issue a command that blocks until all preceeding +// operations have been applied to the FSM. It can be used to ensure the +// FSM reflects all queued writes. An optional timeout can be provided to +// limit the amount of time we wait for the command to be started. This +// must be run on the leader or it will fail. +func (r *Raft) Barrier(timeout time.Duration) Future { + metrics.IncrCounter([]string{"raft", "barrier"}, 1) + var timer <-chan time.Time + if timeout > 0 { + timer = time.After(timeout) + } + + // Create a log future, no index or term yet + logFuture := &logFuture{ + log: Log{ + Type: LogBarrier, + }, + } + logFuture.init() + + select { + case <-timer: + return errorFuture{ErrEnqueueTimeout} + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.applyCh <- logFuture: + return logFuture + } +} + +// VerifyLeader is used to ensure the current node is still +// the leader. This can be done to prevent stale reads when a +// new leader has potentially been elected. +func (r *Raft) VerifyLeader() Future { + metrics.IncrCounter([]string{"raft", "verify_leader"}, 1) + verifyFuture := &verifyFuture{} + verifyFuture.init() + select { + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + case r.verifyCh <- verifyFuture: + return verifyFuture + } +} + +// AddPeer is used to add a new peer into the cluster. This must be +// run on the leader or it will fail. +func (r *Raft) AddPeer(peer string) Future { + logFuture := &logFuture{ + log: Log{ + Type: LogAddPeer, + peer: peer, + }, + } + logFuture.init() + select { + case r.applyCh <- logFuture: + return logFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } +} + +// RemovePeer is used to remove a peer from the cluster. If the +// current leader is being removed, it will cause a new election +// to occur. This must be run on the leader or it will fail. 
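A short usage sketch for the future-returning calls above: Apply enqueues a command with a bounded enqueue wait, and Barrier then ensures everything queued before it has reached the FSM. Both fail (for example with ErrNotLeader) when invoked off the leader.

```go
package raft

import "time"

// applyThenBarrier submits one command and then waits for the FSM to catch
// up with everything enqueued ahead of the barrier.
func applyThenBarrier(r *Raft, cmd []byte) error {
	// Apply returns a future immediately; Error() blocks until the command
	// completes or fails (e.g. enqueue timeout, loss of leadership).
	if err := r.Apply(cmd, 500*time.Millisecond).Error(); err != nil {
		return err
	}

	// Barrier only completes once the FSM has acknowledged all preceding
	// operations, per the LogBarrier description in log.go.
	return r.Barrier(500 * time.Millisecond).Error()
}
```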
+func (r *Raft) RemovePeer(peer string) Future { + logFuture := &logFuture{ + log: Log{ + Type: LogRemovePeer, + peer: peer, + }, + } + logFuture.init() + select { + case r.applyCh <- logFuture: + return logFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } +} + +// SetPeers is used to forcibly replace the set of internal peers and +// the peerstore with the ones specified. This can be considered unsafe. +func (r *Raft) SetPeers(p []string) Future { + peerFuture := &peerFuture{ + peers: p, + } + peerFuture.init() + + select { + case r.peerCh <- peerFuture: + return peerFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } +} + +// Shutdown is used to stop the Raft background routines. +// This is not a graceful operation. Provides a future that +// can be used to block until all background routines have exited. +func (r *Raft) Shutdown() Future { + r.shutdownLock.Lock() + defer r.shutdownLock.Unlock() + + if !r.shutdown { + close(r.shutdownCh) + r.shutdown = true + r.setState(Shutdown) + return &shutdownFuture{r} + } + + // avoid closing transport twice + return &shutdownFuture{nil} +} + +// Snapshot is used to manually force Raft to take a snapshot. +// Returns a future that can be used to block until complete. +func (r *Raft) Snapshot() Future { + snapFuture := &snapshotFuture{} + snapFuture.init() + select { + case r.snapshotCh <- snapFuture: + return snapFuture + case <-r.shutdownCh: + return errorFuture{ErrRaftShutdown} + } + +} + +// State is used to return the current raft state. +func (r *Raft) State() RaftState { + return r.getState() +} + +// LeaderCh is used to get a channel which delivers signals on +// acquiring or losing leadership. It sends true if we become +// the leader, and false if we lose it. The channel is not buffered, +// and does not block on writes. +func (r *Raft) LeaderCh() <-chan bool { + return r.leaderCh +} + +func (r *Raft) String() string { + return fmt.Sprintf("Node at %s [%v]", r.localAddr, r.getState()) +} + +// LastContact returns the time of last contact by a leader. +// This only makes sense if we are currently a follower. +func (r *Raft) LastContact() time.Time { + r.lastContactLock.RLock() + last := r.lastContact + r.lastContactLock.RUnlock() + return last +} + +// Stats is used to return a map of various internal stats. This +// should only be used for informative purposes or debugging. +// +// Keys are: "state", "term", "last_log_index", "last_log_term", +// "commit_index", "applied_index", "fsm_pending", +// "last_snapshot_index", "last_snapshot_term", "num_peers" and +// "last_contact". +// +// The value of "state" is a numerical value representing a +// RaftState const. +// +// The value of "last_contact" is either "never" if there +// has been no contact with a leader, "0" if the node is in the +// leader state, or the time since last contact with a leader +// formatted as a string. +// +// All other values are uint64s, formatted as strings. 
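// Illustrative sketch of consuming LeaderCh and Stats above; the helper name
// and log format are hypothetical. LeaderCh is unbuffered and writes do not
// block, so a transition can be missed if no receiver is ready.
func exampleWatchLeadership(r *Raft, stop <-chan struct{}) {
	for {
		select {
		case isLeader := <-r.LeaderCh():
			stats := r.Stats()
			log.Printf("leadership=%v state=%s term=%s last_contact=%s",
				isLeader, stats["state"], stats["term"], stats["last_contact"])
		case <-stop:
			return
		}
	}
}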
+func (r *Raft) Stats() map[string]string { + toString := func(v uint64) string { + return strconv.FormatUint(v, 10) + } + lastLogIndex, lastLogTerm := r.getLastLog() + lastSnapIndex, lastSnapTerm := r.getLastSnapshot() + s := map[string]string{ + "state": r.getState().String(), + "term": toString(r.getCurrentTerm()), + "last_log_index": toString(lastLogIndex), + "last_log_term": toString(lastLogTerm), + "commit_index": toString(r.getCommitIndex()), + "applied_index": toString(r.getLastApplied()), + "fsm_pending": toString(uint64(len(r.fsmCommitCh))), + "last_snapshot_index": toString(lastSnapIndex), + "last_snapshot_term": toString(lastSnapTerm), + "num_peers": toString(uint64(len(r.peers))), + } + last := r.LastContact() + if last.IsZero() { + s["last_contact"] = "never" + } else if r.getState() == Leader { + s["last_contact"] = "0" + } else { + s["last_contact"] = fmt.Sprintf("%v", time.Now().Sub(last)) + } + return s +} + +// LastIndex returns the last index in stable storage, +// either from the last log or from the last snapshot. +func (r *Raft) LastIndex() uint64 { + return r.getLastIndex() +} + +// AppliedIndex returns the last index applied to the FSM. This is generally +// lagging behind the last index, especially for indexes that are persisted but +// have not yet been considered committed by the leader. NOTE - this reflects +// the last index that was sent to the application's FSM over the apply channel +// but DOES NOT mean that the application's FSM has yet consumed it and applied +// it to its internal state. Thus, the application's state may lag behind this +// index. +func (r *Raft) AppliedIndex() uint64 { + return r.getLastApplied() +} + +// runFSM is a long running goroutine responsible for applying logs +// to the FSM. This is done async of other logs since we don't want +// the FSM to block our internal operations. +func (r *Raft) runFSM() { + var lastIndex, lastTerm uint64 + for { + select { + case req := <-r.fsmRestoreCh: + // Open the snapshot + meta, source, err := r.snapshots.Open(req.ID) + if err != nil { + req.respond(fmt.Errorf("failed to open snapshot %v: %v", req.ID, err)) + continue + } + + // Attempt to restore + start := time.Now() + if err := r.fsm.Restore(source); err != nil { + req.respond(fmt.Errorf("failed to restore snapshot %v: %v", req.ID, err)) + source.Close() + continue + } + source.Close() + metrics.MeasureSince([]string{"raft", "fsm", "restore"}, start) + + // Update the last index and term + lastIndex = meta.Index + lastTerm = meta.Term + req.respond(nil) + + case req := <-r.fsmSnapshotCh: + // Is there something to snapshot? 
+ if lastIndex == 0 { + req.respond(ErrNothingNewToSnapshot) + continue + } + + // Get our peers + peers, err := r.peerStore.Peers() + if err != nil { + req.respond(err) + continue + } + + // Start a snapshot + start := time.Now() + snap, err := r.fsm.Snapshot() + metrics.MeasureSince([]string{"raft", "fsm", "snapshot"}, start) + + // Respond to the request + req.index = lastIndex + req.term = lastTerm + req.peers = peers + req.snapshot = snap + req.respond(err) + + case commitEntry := <-r.fsmCommitCh: + // Apply the log if a command + var resp interface{} + if commitEntry.log.Type == LogCommand { + start := time.Now() + resp = r.fsm.Apply(commitEntry.log) + metrics.MeasureSince([]string{"raft", "fsm", "apply"}, start) + } + + // Update the indexes + lastIndex = commitEntry.log.Index + lastTerm = commitEntry.log.Term + + // Invoke the future if given + if commitEntry.future != nil { + commitEntry.future.response = resp + commitEntry.future.respond(nil) + } + case <-r.shutdownCh: + return + } + } +} + +// run is a long running goroutine that runs the Raft FSM. +func (r *Raft) run() { + for { + // Check if we are doing a shutdown + select { + case <-r.shutdownCh: + // Clear the leader to prevent forwarding + r.setLeader("") + return + default: + } + + // Enter into a sub-FSM + switch r.getState() { + case Follower: + r.runFollower() + case Candidate: + r.runCandidate() + case Leader: + r.runLeader() + } + } +} + +// runFollower runs the FSM for a follower. +func (r *Raft) runFollower() { + didWarn := false + r.logger.Printf("[INFO] raft: %v entering Follower state (Leader: %q)", r, r.Leader()) + metrics.IncrCounter([]string{"raft", "state", "follower"}, 1) + heartbeatTimer := randomTimeout(r.conf.HeartbeatTimeout) + for { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case a := <-r.applyCh: + // Reject any operations since we are not the leader + a.respond(ErrNotLeader) + + case v := <-r.verifyCh: + // Reject any operations since we are not the leader + v.respond(ErrNotLeader) + + case p := <-r.peerCh: + // Set the peers + r.peers = ExcludePeer(p.peers, r.localAddr) + p.respond(r.peerStore.SetPeers(p.peers)) + + case <-heartbeatTimer: + // Restart the heartbeat timer + heartbeatTimer = randomTimeout(r.conf.HeartbeatTimeout) + + // Check if we have had a successful contact + lastContact := r.LastContact() + if time.Now().Sub(lastContact) < r.conf.HeartbeatTimeout { + continue + } + + // Heartbeat failed! Transition to the candidate state + lastLeader := r.Leader() + r.setLeader("") + if len(r.peers) == 0 && !r.conf.EnableSingleNode { + if !didWarn { + r.logger.Printf("[WARN] raft: EnableSingleNode disabled, and no known peers. Aborting election.") + didWarn = true + } + } else { + if atomic.LoadInt64(&r.suspendLeadership) > 0 { + r.logger.Printf(`[WARN] raft: Heartbeat timeout from %q reached, but leadership suspended. Will not enter Candidate mode`, lastLeader) + return + } + r.logger.Printf(`[WARN] raft: Heartbeat timeout from %q reached, starting election`, lastLeader) + + metrics.IncrCounter([]string{"raft", "transition", "heartbeat_timeout"}, 1) + r.setState(Candidate) + return + } + + case <-r.shutdownCh: + return + } + } +} + +// runCandidate runs the FSM for a candidate. 
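// Illustrative sketch of tuning the timeouts that drive the follower loop
// above; the concrete values are hypothetical. The heartbeat timer is
// randomized, and an election only starts once HeartbeatTimeout elapses with
// no leader contact (and candidacy has not been suspended via Yield).
func exampleTunedConfig() *Config {
	conf := DefaultConfig()
	conf.HeartbeatTimeout = 200 * time.Millisecond
	conf.ElectionTimeout = 200 * time.Millisecond
	conf.LeaderLeaseTimeout = 200 * time.Millisecond
	conf.CommitTimeout = 10 * time.Millisecond
	return conf
}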
+func (r *Raft) runCandidate() { + r.logger.Printf("[INFO] raft: %v entering Candidate state", r) + metrics.IncrCounter([]string{"raft", "state", "candidate"}, 1) + + // Start vote for us, and set a timeout + voteCh := r.electSelf() + electionTimer := randomTimeout(r.conf.ElectionTimeout) + + // Tally the votes, need a simple majority + grantedVotes := 0 + votesNeeded := r.quorumSize() + r.logger.Printf("[DEBUG] raft: Votes needed: %d", votesNeeded) + + for r.getState() == Candidate { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case vote := <-voteCh: + // Check if the term is greater than ours, bail + if vote.Term > r.getCurrentTerm() { + r.logger.Printf("[DEBUG] raft: Newer term discovered, fallback to follower") + r.setState(Follower) + r.setCurrentTerm(vote.Term) + return + } + + // Check if the vote is granted + if vote.Granted { + grantedVotes++ + r.logger.Printf("[DEBUG] raft: Vote granted from %s. Tally: %d", vote.voter, grantedVotes) + } + + // Check if we've become the leader + if grantedVotes >= votesNeeded { + r.logger.Printf("[INFO] raft: Election won. Tally: %d", grantedVotes) + r.setState(Leader) + r.setLeader(r.localAddr) + return + } + + case a := <-r.applyCh: + // Reject any operations since we are not the leader + a.respond(ErrNotLeader) + + case v := <-r.verifyCh: + // Reject any operations since we are not the leader + v.respond(ErrNotLeader) + + case p := <-r.peerCh: + // Set the peers + r.peers = ExcludePeer(p.peers, r.localAddr) + p.respond(r.peerStore.SetPeers(p.peers)) + // Become a follower again + r.setState(Follower) + return + + case <-electionTimer: + // Election failed! Restart the election. We simply return, + // which will kick us back into runCandidate + r.logger.Printf("[WARN] raft: Election timeout reached, restarting election") + return + + case <-r.shutdownCh: + return + } + } +} + +// runLeader runs the FSM for a leader. Do the setup here and drop into +// the leaderLoop for the hot loop. +func (r *Raft) runLeader() { + r.logger.Printf("[INFO] raft: %v entering Leader state", r) + metrics.IncrCounter([]string{"raft", "state", "leader"}, 1) + + // Notify that we are the leader + asyncNotifyBool(r.leaderCh, true) + + // Push to the notify channel if given + if notify := r.conf.NotifyCh; notify != nil { + select { + case notify <- true: + case <-r.shutdownCh: + } + } + + // Setup leader state + r.leaderState.commitCh = make(chan struct{}, 1) + r.leaderState.inflight = newInflight(r.leaderState.commitCh) + r.leaderState.replState = make(map[string]*followerReplication) + r.leaderState.notify = make(map[*verifyFuture]struct{}) + r.leaderState.stepDown = make(chan struct{}, 1) + + // Cleanup state on step down + defer func() { + // Since we were the leader previously, we update our + // last contact time when we step down, so that we are not + // reporting a last contact time from before we were the + // leader. Otherwise, to a client it would seem our data + // is extremely stale. 
+ r.setLastContact() + + // Stop replication + for _, p := range r.leaderState.replState { + close(p.stopCh) + } + + // Cancel inflight requests + r.leaderState.inflight.Cancel(ErrLeadershipLost) + + // Respond to any pending verify requests + for future := range r.leaderState.notify { + future.respond(ErrLeadershipLost) + } + + // Clear all the state + r.leaderState.commitCh = nil + r.leaderState.inflight = nil + r.leaderState.replState = nil + r.leaderState.notify = nil + r.leaderState.stepDown = nil + + // If we are stepping down for some reason, no known leader. + // We may have stepped down due to an RPC call, which would + // provide the leader, so we cannot always blank this out. + r.leaderLock.Lock() + if r.leader == r.localAddr { + r.leader = "" + } + r.leaderLock.Unlock() + + // Notify that we are not the leader + asyncNotifyBool(r.leaderCh, false) + + // Push to the notify channel if given + if notify := r.conf.NotifyCh; notify != nil { + select { + case notify <- false: + case <-r.shutdownCh: + // On shutdown, make a best effort but do not block + select { + case notify <- false: + default: + } + } + } + }() + + // Start a replication routine for each peer + for _, peer := range r.peers { + r.startReplication(peer) + } + + // Dispatch a no-op log first. Instead of LogNoop, + // we use a LogAddPeer with our peerset. This acts like + // a no-op as well, but when doing an initial bootstrap, ensures + // that all nodes share a common peerset. + peerSet := append([]string{r.localAddr}, r.peers...) + noop := &logFuture{ + log: Log{ + Type: LogAddPeer, + Data: encodePeers(peerSet, r.trans), + }, + } + r.dispatchLogs([]*logFuture{noop}) + + // Disable EnableSingleNode after we've been elected leader. + // This is to prevent a split brain in the future, if we are removed + // from the cluster and then elect ourself as leader. + if r.conf.DisableBootstrapAfterElect && r.conf.EnableSingleNode { + r.logger.Printf("[INFO] raft: Disabling EnableSingleNode (bootstrap)") + r.conf.EnableSingleNode = false + } + + // Sit in the leader loop until we step down + r.leaderLoop() +} + +// startReplication is a helper to setup state and start async replication to a peer. +func (r *Raft) startReplication(peer string) { + lastIdx := r.getLastIndex() + s := &followerReplication{ + peer: peer, + inflight: r.leaderState.inflight, + stopCh: make(chan uint64, 1), + triggerCh: make(chan struct{}, 1), + currentTerm: r.getCurrentTerm(), + matchIndex: 0, + nextIndex: lastIdx + 1, + lastContact: time.Now(), + notifyCh: make(chan struct{}, 1), + stepDown: r.leaderState.stepDown, + } + r.leaderState.replState[peer] = s + r.goFunc(func() { r.replicate(s) }) + asyncNotifyCh(s.triggerCh) +} + +// leaderLoop is the hot loop for a leader. It is invoked +// after all the various leader setup is done. +func (r *Raft) leaderLoop() { + // stepDown is used to track if there is an inflight log that + // would cause us to lose leadership (specifically a RemovePeer of + // ourselves). If this is the case, we must not allow any logs to + // be processed in parallel, otherwise we are basing commit on + // only a single peer (ourself) and replicating to an undefined set + // of peers. 
+ stepDown := false + + lease := time.After(r.conf.LeaderLeaseTimeout) + for r.getState() == Leader { + select { + case rpc := <-r.rpcCh: + r.processRPC(rpc) + + case <-r.leaderState.stepDown: + r.setState(Follower) + + case <-r.leaderState.commitCh: + // Get the committed messages + committed := r.leaderState.inflight.Committed() + for e := committed.Front(); e != nil; e = e.Next() { + // Measure the commit time + commitLog := e.Value.(*logFuture) + metrics.MeasureSince([]string{"raft", "commitTime"}, commitLog.dispatch) + + // Increment the commit index + idx := commitLog.log.Index + r.setCommitIndex(idx) + r.processLogs(idx, commitLog) + } + + case v := <-r.verifyCh: + if v.quorumSize == 0 { + // Just dispatched, start the verification + r.verifyLeader(v) + + } else if v.votes < v.quorumSize { + // Early return, means there must be a new leader + r.logger.Printf("[WARN] raft: New leader elected, stepping down") + r.setState(Follower) + delete(r.leaderState.notify, v) + v.respond(ErrNotLeader) + + } else { + // Quorum of members agree, we are still leader + delete(r.leaderState.notify, v) + v.respond(nil) + } + + case p := <-r.peerCh: + p.respond(ErrLeader) + + case newLog := <-r.applyCh: + // Group commit, gather all the ready commits + ready := []*logFuture{newLog} + for i := 0; i < r.conf.MaxAppendEntries; i++ { + select { + case newLog := <-r.applyCh: + ready = append(ready, newLog) + default: + break + } + } + + // Handle any peer set changes + n := len(ready) + for i := 0; i < n; i++ { + // Fail all future transactions once stepDown is on + if stepDown { + ready[i].respond(ErrNotLeader) + ready[i], ready[n-1] = ready[n-1], nil + n-- + i-- + continue + } + + // Special case AddPeer and RemovePeer + log := ready[i] + if log.log.Type != LogAddPeer && log.log.Type != LogRemovePeer { + continue + } + + // Check if this log should be ignored. The logs can be + // reordered here since we have not yet assigned an index + // and are not violating any promises. + if !r.preparePeerChange(log) { + ready[i], ready[n-1] = ready[n-1], nil + n-- + i-- + continue + } + + // Apply peer set changes early and check if we will step + // down after the commit of this log. If so, we must not + // allow any future entries to make progress to avoid undefined + // behavior. + if ok := r.processLog(&log.log, nil, true); ok { + stepDown = true + } + } + + // Nothing to do if all logs are invalid + if n == 0 { + continue + } + + // Dispatch the logs + ready = ready[:n] + r.dispatchLogs(ready) + + case <-lease: + // Check if we've exceeded the lease, potentially stepping down + maxDiff := r.checkLeaderLease() + + // Next check interval should adjust for the last node we've + // contacted, without going negative + checkInterval := r.conf.LeaderLeaseTimeout - maxDiff + if checkInterval < minCheckInterval { + checkInterval = minCheckInterval + } + + // Renew the lease timer + lease = time.After(checkInterval) + + case <-r.shutdownCh: + return + } + } +} + +// verifyLeader must be called from the main thread for safety. +// Causes the followers to attempt an immediate heartbeat. 
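// Illustrative sketch of the lease renewal arithmetic in the <-lease case
// above: the next check is scheduled LeaderLeaseTimeout after the oldest
// successful follower contact, floored at minCheckInterval. The helper name is
// hypothetical.
func exampleNextLeaseCheck(conf *Config, maxDiff time.Duration) time.Duration {
	interval := conf.LeaderLeaseTimeout - maxDiff
	if interval < minCheckInterval {
		interval = minCheckInterval
	}
	return interval
}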
+func (r *Raft) verifyLeader(v *verifyFuture) { + // Current leader always votes for self + v.votes = 1 + + // Set the quorum size, hot-path for single node + v.quorumSize = r.quorumSize() + if v.quorumSize == 1 { + v.respond(nil) + return + } + + // Track this request + v.notifyCh = r.verifyCh + r.leaderState.notify[v] = struct{}{} + + // Trigger immediate heartbeats + for _, repl := range r.leaderState.replState { + repl.notifyLock.Lock() + repl.notify = append(repl.notify, v) + repl.notifyLock.Unlock() + asyncNotifyCh(repl.notifyCh) + } +} + +// checkLeaderLease is used to check if we can contact a quorum of nodes +// within the last leader lease interval. If not, we need to step down, +// as we may have lost connectivity. Returns the maximum duration without +// contact. +func (r *Raft) checkLeaderLease() time.Duration { + // Track contacted nodes, we can always contact ourself + contacted := 1 + + // Check each follower + var maxDiff time.Duration + now := time.Now() + for peer, f := range r.leaderState.replState { + diff := now.Sub(f.LastContact()) + if diff <= r.conf.LeaderLeaseTimeout { + contacted++ + if diff > maxDiff { + maxDiff = diff + } + } else { + // Log at least once at high value, then debug. Otherwise it gets very verbose. + if diff <= 3*r.conf.LeaderLeaseTimeout { + r.logger.Printf("[WARN] raft: Failed to contact %v in %v", peer, diff) + } else { + r.logger.Printf("[DEBUG] raft: Failed to contact %v in %v", peer, diff) + } + } + metrics.AddSample([]string{"raft", "leader", "lastContact"}, float32(diff/time.Millisecond)) + } + + // Verify we can contact a quorum + quorum := r.quorumSize() + if contacted < quorum { + r.logger.Printf("[WARN] raft: Failed to contact quorum of nodes, stepping down") + r.setState(Follower) + metrics.IncrCounter([]string{"raft", "transition", "leader_lease_timeout"}, 1) + } + return maxDiff +} + +// quorumSize is used to return the quorum size +func (r *Raft) quorumSize() int { + return ((len(r.peers) + 1) / 2) + 1 +} + +// preparePeerChange checks if a LogAddPeer or LogRemovePeer should be performed, +// and properly formats the data field on the log before dispatching it. +func (r *Raft) preparePeerChange(l *logFuture) bool { + // Check if this is a known peer + p := l.log.peer + knownPeer := PeerContained(r.peers, p) || r.localAddr == p + + // Ignore known peers on add + if l.log.Type == LogAddPeer && knownPeer { + l.respond(ErrKnownPeer) + return false + } + + // Ignore unknown peers on remove + if l.log.Type == LogRemovePeer && !knownPeer { + l.respond(ErrUnknownPeer) + return false + } + + // Construct the peer set + var peerSet []string + if l.log.Type == LogAddPeer { + peerSet = append([]string{p, r.localAddr}, r.peers...) + } else { + peerSet = ExcludePeer(append([]string{r.localAddr}, r.peers...), p) + } + + // Setup the log + l.log.Data = encodePeers(peerSet, r.trans) + return true +} + +// dispatchLog is called to push a log to disk, mark it +// as inflight and begin replication of it. 
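// Worked example of quorumSize above, which counts the local node in addition
// to r.peers: with 2 remote peers (a 3-node cluster) the quorum is
// ((2+1)/2)+1 = 2, and with 4 remote peers (5 nodes) it is ((4+1)/2)+1 = 3.
// The helper below is a hypothetical standalone restatement.
func exampleQuorum(remotePeers int) int {
	return ((remotePeers + 1) / 2) + 1
}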
+func (r *Raft) dispatchLogs(applyLogs []*logFuture) { + now := time.Now() + defer metrics.MeasureSince([]string{"raft", "leader", "dispatchLog"}, now) + + term := r.getCurrentTerm() + lastIndex := r.getLastIndex() + logs := make([]*Log, len(applyLogs)) + + for idx, applyLog := range applyLogs { + applyLog.dispatch = now + applyLog.log.Index = lastIndex + uint64(idx) + 1 + applyLog.log.Term = term + applyLog.policy = newMajorityQuorum(len(r.peers) + 1) + logs[idx] = &applyLog.log + } + + // Write the log entry locally + if err := r.logs.StoreLogs(logs); err != nil { + r.logger.Printf("[ERR] raft: Failed to commit logs: %v", err) + for _, applyLog := range applyLogs { + applyLog.respond(err) + } + r.setState(Follower) + return + } + + // Add this to the inflight logs, commit + r.leaderState.inflight.StartAll(applyLogs) + + // Update the last log since it's on disk now + r.setLastLog(lastIndex+uint64(len(applyLogs)), term) + + // Notify the replicators of the new log + for _, f := range r.leaderState.replState { + asyncNotifyCh(f.triggerCh) + } +} + +// processLogs is used to process all the logs from the lastApplied +// up to the given index. +func (r *Raft) processLogs(index uint64, future *logFuture) { + // Reject logs we've applied already + lastApplied := r.getLastApplied() + if index <= lastApplied { + r.logger.Printf("[WARN] raft: Skipping application of old log: %d", index) + return + } + + // Apply all the preceding logs + for idx := r.getLastApplied() + 1; idx <= index; idx++ { + // Get the log, either from the future or from our log store + if future != nil && future.log.Index == idx { + r.processLog(&future.log, future, false) + + } else { + l := new(Log) + if err := r.logs.GetLog(idx, l); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at %d: %v", idx, err) + panic(err) + } + r.processLog(l, nil, false) + } + + // Update the lastApplied index and term + r.setLastApplied(idx) + } +} + +// processLog is invoked to process the application of a single committed log. +// Returns if this log entry would cause us to stepDown after it commits. +func (r *Raft) processLog(l *Log, future *logFuture, precommit bool) (stepDown bool) { + switch l.Type { + case LogBarrier: + // Barrier is handled by the FSM + fallthrough + + case LogCommand: + // Forward to the fsm handler + select { + case r.fsmCommitCh <- commitTuple{l, future}: + case <-r.shutdownCh: + if future != nil { + future.respond(ErrRaftShutdown) + } + } + + // Return so that the future is only responded to + // by the FSM handler when the application is done + return + + case LogAddPeer: + fallthrough + case LogRemovePeer: + peers := decodePeers(l.Data, r.trans) + r.logger.Printf("[DEBUG] raft: Node %v updated peer set (%v): %v", r.localAddr, l.Type, peers) + + // If the peer set does not include us, remove all other peers + removeSelf := !PeerContained(peers, r.localAddr) && l.Type == LogRemovePeer + if removeSelf { + // Mark that this operation will cause us to step down as + // leader. This prevents the future logs from being Applied + // from this leader. + stepDown = true + + // We only modify the peers after the commit, otherwise we + // would be using a quorum size of 1 for the RemovePeer operation. + // This is used with the stepDown guard to prevent any other logs. 
+ if !precommit { + r.peers = nil + r.peerStore.SetPeers([]string{r.localAddr}) + } + } else { + r.peers = ExcludePeer(peers, r.localAddr) + r.peerStore.SetPeers(peers) + } + + // Handle replication if we are the leader + if r.getState() == Leader { + for _, p := range r.peers { + if _, ok := r.leaderState.replState[p]; !ok { + r.logger.Printf("[INFO] raft: Added peer %v, starting replication", p) + r.startReplication(p) + } + } + } + + // Stop replication for old nodes + if r.getState() == Leader && !precommit { + var toDelete []string + for _, repl := range r.leaderState.replState { + if !PeerContained(r.peers, repl.peer) { + r.logger.Printf("[INFO] raft: Removed peer %v, stopping replication (Index: %d)", repl.peer, l.Index) + + // Replicate up to this index and stop + repl.stopCh <- l.Index + close(repl.stopCh) + toDelete = append(toDelete, repl.peer) + } + } + for _, name := range toDelete { + delete(r.leaderState.replState, name) + } + } + + // Handle removing ourself + if removeSelf && !precommit { + if r.conf.ShutdownOnRemove { + r.logger.Printf("[INFO] raft: Removed ourself, shutting down") + r.Shutdown() + } else { + r.logger.Printf("[INFO] raft: Removed ourself, transitioning to follower") + r.setState(Follower) + } + } + + case LogNoop: + // Ignore the no-op + default: + r.logger.Printf("[ERR] raft: Got unrecognized log type: %#v", l) + } + + // Invoke the future if given + if future != nil && !precommit { + future.respond(nil) + } + return +} + +// processRPC is called to handle an incoming RPC request. +func (r *Raft) processRPC(rpc RPC) { + switch cmd := rpc.Command.(type) { + case *AppendEntriesRequest: + r.appendEntries(rpc, cmd) + case *RequestVoteRequest: + r.requestVote(rpc, cmd) + case *InstallSnapshotRequest: + r.installSnapshot(rpc, cmd) + default: + r.logger.Printf("[ERR] raft: Got unexpected command: %#v", rpc.Command) + rpc.Respond(nil, fmt.Errorf("unexpected command")) + } +} + +// processHeartbeat is a special handler used just for heartbeat requests +// so that they can be fast-pathed if a transport supports it. +func (r *Raft) processHeartbeat(rpc RPC) { + defer metrics.MeasureSince([]string{"raft", "rpc", "processHeartbeat"}, time.Now()) + + // Check if we are shutdown, just ignore the RPC + select { + case <-r.shutdownCh: + return + default: + } + + // Ensure we are only handling a heartbeat + switch cmd := rpc.Command.(type) { + case *AppendEntriesRequest: + r.appendEntries(rpc, cmd) + default: + r.logger.Printf("[ERR] raft: Expected heartbeat, got command: %#v", rpc.Command) + rpc.Respond(nil, fmt.Errorf("unexpected command")) + } +} + +// appendEntries is invoked when we get an append entries RPC call. 
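// Illustrative sketch related to the heartbeat fast-path above: a heartbeat is
// essentially an AppendEntriesRequest that carries no entries. The exact check
// a transport uses to decide what may bypass the main RPC channel is an
// assumption here, not taken from this diff.
func exampleLooksLikeHeartbeat(req *AppendEntriesRequest) bool {
	return len(req.Entries) == 0 &&
		req.PrevLogEntry == 0 &&
		req.PrevLogTerm == 0 &&
		req.LeaderCommitIndex == 0
}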
+func (r *Raft) appendEntries(rpc RPC, a *AppendEntriesRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "appendEntries"}, time.Now()) + // Setup a response + resp := &AppendEntriesResponse{ + Term: r.getCurrentTerm(), + LastLog: r.getLastIndex(), + Success: false, + NoRetryBackoff: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Ignore an older term + if a.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one, also transition to follower + // if we ever get an appendEntries call + if a.Term > r.getCurrentTerm() || r.getState() != Follower { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(a.Term) + resp.Term = a.Term + } + + // Save the current leader + r.setLeader(r.trans.DecodePeer(a.Leader)) + + // Verify the last log entry + if a.PrevLogEntry > 0 { + lastIdx, lastTerm := r.getLastEntry() + + var prevLogTerm uint64 + if a.PrevLogEntry == lastIdx { + prevLogTerm = lastTerm + + } else { + var prevLog Log + if err := r.logs.GetLog(a.PrevLogEntry, &prevLog); err != nil { + r.logger.Printf("[WARN] raft: Failed to get previous log: %d %v (last: %d)", + a.PrevLogEntry, err, lastIdx) + resp.NoRetryBackoff = true + return + } + prevLogTerm = prevLog.Term + } + + if a.PrevLogTerm != prevLogTerm { + r.logger.Printf("[WARN] raft: Previous log term mis-match: ours: %d remote: %d", + prevLogTerm, a.PrevLogTerm) + resp.NoRetryBackoff = true + return + } + } + + // Process any new entries + if n := len(a.Entries); n > 0 { + start := time.Now() + first := a.Entries[0] + last := a.Entries[n-1] + + // Delete any conflicting entries + lastLogIdx, _ := r.getLastLog() + if first.Index <= lastLogIdx { + r.logger.Printf("[WARN] raft: Clearing log suffix from %d to %d", first.Index, lastLogIdx) + if err := r.logs.DeleteRange(first.Index, lastLogIdx); err != nil { + r.logger.Printf("[ERR] raft: Failed to clear log suffix: %v", err) + return + } + } + + // Append the entry + if err := r.logs.StoreLogs(a.Entries); err != nil { + r.logger.Printf("[ERR] raft: Failed to append to logs: %v", err) + return + } + + // Update the lastLog + r.setLastLog(last.Index, last.Term) + metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "storeLogs"}, start) + } + + // Update the commit index + if a.LeaderCommitIndex > 0 && a.LeaderCommitIndex > r.getCommitIndex() { + start := time.Now() + idx := min(a.LeaderCommitIndex, r.getLastIndex()) + r.setCommitIndex(idx) + r.processLogs(idx, nil) + metrics.MeasureSince([]string{"raft", "rpc", "appendEntries", "processLogs"}, start) + } + + // Everything went well, set success + resp.Success = true + r.setLastContact() + return +} + +// requestVote is invoked when we get an request vote RPC call. 
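// Hypothetical restatement of the previous-log check in appendEntries above: a
// follower only accepts new entries when it holds the leader's PrevLogEntry
// with a matching term; otherwise Success stays false and NoRetryBackoff is
// set, as in the two failure branches above.
func examplePrevLogMatches(prevIdx, prevTerm, localTermAtPrevIdx uint64, haveEntry bool) bool {
	if prevIdx == 0 {
		return true // nothing before the first entry to verify
	}
	return haveEntry && localTermAtPrevIdx == prevTerm
}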
+func (r *Raft) requestVote(rpc RPC, req *RequestVoteRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "requestVote"}, time.Now()) + r.observe(*req) + + // Setup a response + resp := &RequestVoteResponse{ + Term: r.getCurrentTerm(), + Peers: encodePeers(r.peers, r.trans), + Granted: false, + } + var rpcErr error + defer func() { + rpc.Respond(resp, rpcErr) + }() + + // Check if we have an existing leader [who's not the candidate] + candidate := r.trans.DecodePeer(req.Candidate) + if leader := r.Leader(); leader != "" && leader != candidate { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since we have a leader: %v", + candidate, leader) + return + } + + // Ignore an older term + if req.Term < r.getCurrentTerm() { + return + } + + // Increase the term if we see a newer one + if req.Term > r.getCurrentTerm() { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(req.Term) + resp.Term = req.Term + } + + // Check if we have voted yet + lastVoteTerm, err := r.stable.GetUint64(keyLastVoteTerm) + if err != nil && err.Error() != "not found" { + r.logger.Printf("[ERR] raft: Failed to get last vote term: %v", err) + return + } + lastVoteCandBytes, err := r.stable.Get(keyLastVoteCand) + if err != nil && err.Error() != "not found" { + r.logger.Printf("[ERR] raft: Failed to get last vote candidate: %v", err) + return + } + + // Check if we've voted in this election before + if lastVoteTerm == req.Term && lastVoteCandBytes != nil { + r.logger.Printf("[INFO] raft: Duplicate RequestVote for same term: %d", req.Term) + if bytes.Compare(lastVoteCandBytes, req.Candidate) == 0 { + r.logger.Printf("[WARN] raft: Duplicate RequestVote from candidate: %s", req.Candidate) + resp.Granted = true + } + return + } + + // Reject if their term is older + lastIdx, lastTerm := r.getLastEntry() + if lastTerm > req.LastLogTerm { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last term is greater (%d, %d)", + candidate, lastTerm, req.LastLogTerm) + return + } + + if lastTerm == req.LastLogTerm && lastIdx > req.LastLogIndex { + r.logger.Printf("[WARN] raft: Rejecting vote request from %v since our last index is greater (%d, %d)", + candidate, lastIdx, req.LastLogIndex) + return + } + + // Persist a vote for safety + if err := r.persistVote(req.Term, req.Candidate); err != nil { + r.logger.Printf("[ERR] raft: Failed to persist vote: %v", err) + return + } + + resp.Granted = true + r.setLastContact() + return +} + +// installSnapshot is invoked when we get a InstallSnapshot RPC call. +// We must be in the follower state for this, since it means we are +// too far behind a leader for log replay. 
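// Hypothetical restatement of the log comparison at the end of requestVote
// above: a vote is refused when the local log has a higher last term, or the
// same last term with a higher last index.
func exampleCandidateLogUpToDate(localLastIdx, localLastTerm, candLastIdx, candLastTerm uint64) bool {
	if localLastTerm > candLastTerm {
		return false
	}
	if localLastTerm == candLastTerm && localLastIdx > candLastIdx {
		return false
	}
	return true
}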
+func (r *Raft) installSnapshot(rpc RPC, req *InstallSnapshotRequest) { + defer metrics.MeasureSince([]string{"raft", "rpc", "installSnapshot"}, time.Now()) + // Setup a response + resp := &InstallSnapshotResponse{ + Term: r.getCurrentTerm(), + Success: false, + } + var rpcErr error + defer func() { + io.Copy(ioutil.Discard, rpc.Reader) // ensure we always consume all the snapshot data from the stream [see issue #212] + rpc.Respond(resp, rpcErr) + }() + + // Ignore an older term + if req.Term < r.getCurrentTerm() { + r.logger.Printf("[INFO] raft: Ignoring installSnapshot request with older term of %d vs currentTerm %d", req.Term, r.getCurrentTerm()) + return + } + + // Increase the term if we see a newer one + if req.Term > r.getCurrentTerm() { + // Ensure transition to follower + r.setState(Follower) + r.setCurrentTerm(req.Term) + resp.Term = req.Term + } + + // Save the current leader + r.setLeader(r.trans.DecodePeer(req.Leader)) + + // Create a new snapshot + sink, err := r.snapshots.Create(req.LastLogIndex, req.LastLogTerm, req.Peers) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to create snapshot to install: %v", err) + rpcErr = fmt.Errorf("failed to create snapshot: %v", err) + return + } + + // Spill the remote snapshot to disk + n, err := io.Copy(sink, rpc.Reader) + if err != nil { + sink.Cancel() + r.logger.Printf("[ERR] raft: Failed to copy snapshot: %v", err) + rpcErr = err + return + } + + // Check that we received it all + if n != req.Size { + sink.Cancel() + r.logger.Printf("[ERR] raft: Failed to receive whole snapshot: %d / %d", n, req.Size) + rpcErr = fmt.Errorf("short read") + return + } + + // Finalize the snapshot + if err := sink.Close(); err != nil { + r.logger.Printf("[ERR] raft: Failed to finalize snapshot: %v", err) + rpcErr = err + return + } + r.logger.Printf("[INFO] raft: Copied %d bytes to local snapshot", n) + + // Restore snapshot + future := &restoreFuture{ID: sink.ID()} + future.init() + select { + case r.fsmRestoreCh <- future: + case <-r.shutdownCh: + future.respond(ErrRaftShutdown) + return + } + + // Wait for the restore to happen + if err := future.Error(); err != nil { + r.logger.Printf("[ERR] raft: Failed to restore snapshot: %v", err) + rpcErr = err + return + } + + // Update the lastApplied so we don't replay old logs + r.setLastApplied(req.LastLogIndex) + + // Update the last stable snapshot info + r.setLastSnapshot(req.LastLogIndex, req.LastLogTerm) + + // Restore the peer set + peers := decodePeers(req.Peers, r.trans) + r.peers = ExcludePeer(peers, r.localAddr) + r.peerStore.SetPeers(peers) + + // Compact logs, continue even if this fails + if err := r.compactLogs(req.LastLogIndex); err != nil { + r.logger.Printf("[ERR] raft: Failed to compact logs: %v", err) + } + + r.logger.Printf("[INFO] raft: Installed remote snapshot") + resp.Success = true + r.setLastContact() + return +} + +// setLastContact is used to set the last contact time to now +func (r *Raft) setLastContact() { + r.lastContactLock.Lock() + r.lastContact = time.Now() + r.lastContactLock.Unlock() +} + +type voteResult struct { + RequestVoteResponse + voter string +} + +// electSelf is used to send a RequestVote RPC to all peers, +// and vote for ourself. This has the side affecting of incrementing +// the current term. The response channel returned is used to wait +// for all the responses (including a vote for ourself). 
+func (r *Raft) electSelf() <-chan *voteResult { + // Create a response channel + respCh := make(chan *voteResult, len(r.peers)+1) + + // Increment the term + r.setCurrentTerm(r.getCurrentTerm() + 1) + + // Construct the request + lastIdx, lastTerm := r.getLastEntry() + req := &RequestVoteRequest{ + Term: r.getCurrentTerm(), + Candidate: r.trans.EncodePeer(r.localAddr), + LastLogIndex: lastIdx, + LastLogTerm: lastTerm, + } + + // Construct a function to ask for a vote + askPeer := func(peer string) { + r.goFunc(func() { + defer metrics.MeasureSince([]string{"raft", "candidate", "electSelf"}, time.Now()) + resp := &voteResult{voter: peer} + err := r.trans.RequestVote(peer, req, &resp.RequestVoteResponse) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to make RequestVote RPC to %v: %v", peer, err) + resp.Term = req.Term + resp.Granted = false + } + + // If we are not a peer, we could have been removed but failed + // to receive the log message. OR it could mean an improperly configured + // cluster. Either way, we should warn + if err == nil { + peerSet := decodePeers(resp.Peers, r.trans) + if !PeerContained(peerSet, r.localAddr) { + r.logger.Printf("[WARN] raft: Remote peer %v does not have local node %v as a peer", + peer, r.localAddr) + } + } + + respCh <- resp + }) + } + + // For each peer, request a vote + for _, peer := range r.peers { + askPeer(peer) + } + + // Persist a vote for ourselves + if err := r.persistVote(req.Term, req.Candidate); err != nil { + r.logger.Printf("[ERR] raft: Failed to persist vote : %v", err) + return nil + } + + // Include our own vote + respCh <- &voteResult{ + RequestVoteResponse: RequestVoteResponse{ + Term: req.Term, + Granted: true, + }, + voter: r.localAddr, + } + return respCh +} + +// persistVote is used to persist our vote for safety. +func (r *Raft) persistVote(term uint64, candidate []byte) error { + if err := r.stable.SetUint64(keyLastVoteTerm, term); err != nil { + return err + } + if err := r.stable.Set(keyLastVoteCand, candidate); err != nil { + return err + } + return nil +} + +// setCurrentTerm is used to set the current term in a durable manner. +func (r *Raft) setCurrentTerm(t uint64) { + // Persist to disk first + if err := r.stable.SetUint64(keyCurrentTerm, t); err != nil { + panic(fmt.Errorf("failed to save current term: %v", err)) + } + r.raftState.setCurrentTerm(t) +} + +// setState is used to update the current state. Any state +// transition causes the known leader to be cleared. This means +// that leader should be set only after updating the state. +func (r *Raft) setState(state RaftState) { + r.setLeader("") + oldState := r.raftState.getState() + r.raftState.setState(state) + if oldState != state { + r.observe(state) + } +} + +// runSnapshots is a long running goroutine used to manage taking +// new snapshots of the FSM. It runs in parallel to the FSM and +// main goroutines, so that snapshots do not block normal operation. 
+func (r *Raft) runSnapshots() { + for { + select { + case <-randomTimeout(r.conf.SnapshotInterval): + // Check if we should snapshot + if !r.shouldSnapshot() { + continue + } + + // Trigger a snapshot + if err := r.takeSnapshot(); err != nil { + r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err) + } + + case future := <-r.snapshotCh: + // User-triggered, run immediately + err := r.takeSnapshot() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to take snapshot: %v", err) + } + future.respond(err) + + case <-r.shutdownCh: + return + } + } +} + +// shouldSnapshot checks if we meet the conditions to take +// a new snapshot. +func (r *Raft) shouldSnapshot() bool { + // Check the last snapshot index + lastSnap, _ := r.getLastSnapshot() + + // Check the last log index + lastIdx, err := r.logs.LastIndex() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to get last log index: %v", err) + return false + } + + // Compare the delta to the threshold + delta := lastIdx - lastSnap + return delta >= r.conf.SnapshotThreshold +} + +// takeSnapshot is used to take a new snapshot. +func (r *Raft) takeSnapshot() error { + defer metrics.MeasureSince([]string{"raft", "snapshot", "takeSnapshot"}, time.Now()) + // Create a snapshot request + req := &reqSnapshotFuture{} + req.init() + + // Wait for dispatch or shutdown + select { + case r.fsmSnapshotCh <- req: + case <-r.shutdownCh: + return ErrRaftShutdown + } + + // Wait until we get a response + if err := req.Error(); err != nil { + if err != ErrNothingNewToSnapshot { + err = fmt.Errorf("failed to start snapshot: %v", err) + } + return err + } + defer req.snapshot.Release() + + // Log that we are starting the snapshot + r.logger.Printf("[INFO] raft: Starting snapshot up to %d", req.index) + + // Encode the peerset + peerSet := encodePeers(req.peers, r.trans) + + // Create a new snapshot + start := time.Now() + sink, err := r.snapshots.Create(req.index, req.term, peerSet) + if err != nil { + return fmt.Errorf("failed to create snapshot: %v", err) + } + metrics.MeasureSince([]string{"raft", "snapshot", "create"}, start) + + // Try to persist the snapshot + start = time.Now() + if err := req.snapshot.Persist(sink); err != nil { + sink.Cancel() + return fmt.Errorf("failed to persist snapshot: %v", err) + } + metrics.MeasureSince([]string{"raft", "snapshot", "persist"}, start) + + // Close and check for error + if err := sink.Close(); err != nil { + return fmt.Errorf("failed to close snapshot: %v", err) + } + + // Update the last stable snapshot info + r.setLastSnapshot(req.index, req.term) + + // Compact the logs + if err := r.compactLogs(req.index); err != nil { + return err + } + + // Log completion + r.logger.Printf("[INFO] raft: Snapshot to %d complete", req.index) + return nil +} + +// compactLogs takes the last inclusive index of a snapshot +// and trims the logs that are no longer needed. +func (r *Raft) compactLogs(snapIdx uint64) error { + defer metrics.MeasureSince([]string{"raft", "compactLogs"}, time.Now()) + // Determine log ranges to compact + minLog, err := r.logs.FirstIndex() + if err != nil { + return fmt.Errorf("failed to get first log index: %v", err) + } + + // Check if we have enough logs to truncate + lastLogIdx, _ := r.getLastLog() + if lastLogIdx <= r.conf.TrailingLogs { + return nil + } + + // Truncate up to the end of the snapshot, or `TrailingLogs` + // back from the head, which ever is further back. 
This ensures + // at least `TrailingLogs` entries, but does not allow logs + // after the snapshot to be removed. + maxLog := min(snapIdx, lastLogIdx-r.conf.TrailingLogs) + + // Log this + r.logger.Printf("[INFO] raft: Compacting logs from %d to %d", minLog, maxLog) + + // Compact the logs + if err := r.logs.DeleteRange(minLog, maxLog); err != nil { + return fmt.Errorf("log compaction failed: %v", err) + } + return nil +} + +// restoreSnapshot attempts to restore the latest snapshots, and fails +// if none of them can be restored. This is called at initialization time, +// and is completely unsafe to call at any other time. +func (r *Raft) restoreSnapshot() error { + snapshots, err := r.snapshots.List() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err) + return err + } + + // Try to load in order of newest to oldest + for _, snapshot := range snapshots { + _, source, err := r.snapshots.Open(snapshot.ID) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapshot.ID, err) + continue + } + defer source.Close() + + if err := r.fsm.Restore(source); err != nil { + r.logger.Printf("[ERR] raft: Failed to restore snapshot %v: %v", snapshot.ID, err) + continue + } + + // Log success + r.logger.Printf("[INFO] raft: Restored from snapshot %v", snapshot.ID) + + // Update the lastApplied so we don't replay old logs + r.setLastApplied(snapshot.Index) + + // Update the last stable snapshot info + r.setLastSnapshot(snapshot.Index, snapshot.Term) + + // Success! + return nil + } + + // If we had snapshots and failed to load them, its an error + if len(snapshots) > 0 { + return fmt.Errorf("failed to load any existing snapshots") + } + return nil +} + +// StepDown instructs a leader to voluntarily step down, reentering election cycle. +// Note that the node may yet win elections again immediately following. +func (r *Raft) StepDown() error { + if r.getState() != Leader { + return fmt.Errorf("StepDown() is only applicable to the leader") + } + asyncNotifyCh(r.leaderState.stepDown) + return nil +} + +// Yield instructs the node to not attempt becoming a leader in the +// following duration. +func (r *Raft) Yield() error { + atomic.AddInt64(&r.suspendLeadership, 1) + yieldDuration := r.conf.HeartbeatTimeout * 5 // time enough for the yielded-to peer to become leader + go time.AfterFunc(yieldDuration, func() { + atomic.AddInt64(&r.suspendLeadership, -1) + }) + if r.getState() == Leader { + r.StepDown() + } + return nil +} diff --git a/go/vt/orchestrator/external/raft/raft_test.go b/go/vt/orchestrator/external/raft/raft_test.go new file mode 100644 index 0000000000..5eb660aede --- /dev/null +++ b/go/vt/orchestrator/external/raft/raft_test.go @@ -0,0 +1,1845 @@ +package raft + +import ( + "bytes" + "fmt" + "io" + "io/ioutil" + "log" + "os" + "reflect" + "sync" + "sync/atomic" + "testing" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +// MockFSM is an implementation of the FSM interface, and just stores +// the logs sequentially. 
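// Illustrative sketch of the StepDown/Yield additions above: Yield suspends
// candidacy for roughly 5*HeartbeatTimeout and, when called on the leader,
// also steps it down so another peer can win the next election. The helper
// name and the polling interval are hypothetical.
func exampleHandOverLeadership(r *Raft, wait time.Duration) error {
	if err := r.Yield(); err != nil {
		return err
	}
	deadline := time.Now().Add(wait)
	for time.Now().Before(deadline) {
		if r.State() != Leader && r.Leader() != "" {
			return nil // some other node has taken over
		}
		time.Sleep(50 * time.Millisecond)
	}
	return fmt.Errorf("no new leader observed within %v", wait)
}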
+type MockFSM struct { + sync.Mutex + logs [][]byte +} + +type MockSnapshot struct { + logs [][]byte + maxIndex int +} + +func (m *MockFSM) Apply(log *Log) interface{} { + m.Lock() + defer m.Unlock() + m.logs = append(m.logs, log.Data) + return len(m.logs) +} + +func (m *MockFSM) Snapshot() (FSMSnapshot, error) { + m.Lock() + defer m.Unlock() + return &MockSnapshot{m.logs, len(m.logs)}, nil +} + +func (m *MockFSM) Restore(inp io.ReadCloser) error { + m.Lock() + defer m.Unlock() + defer inp.Close() + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(inp, &hd) + + m.logs = nil + return dec.Decode(&m.logs) +} + +func (m *MockSnapshot) Persist(sink SnapshotSink) error { + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(sink, &hd) + if err := enc.Encode(m.logs[:m.maxIndex]); err != nil { + sink.Cancel() + return err + } + sink.Close() + return nil +} + +func (m *MockSnapshot) Release() { +} + +// Return configurations optimized for in-memory +func inmemConfig(t *testing.T) *Config { + conf := DefaultConfig() + conf.HeartbeatTimeout = 50 * time.Millisecond + conf.ElectionTimeout = 50 * time.Millisecond + conf.LeaderLeaseTimeout = 50 * time.Millisecond + conf.CommitTimeout = 5 * time.Millisecond + conf.Logger = newTestLogger(t) + return conf +} + +// This can be used as the destination for a logger and it'll +// map them into calls to testing.T.Log, so that you only see +// the logging for failed tests. +type testLoggerAdapter struct { + t *testing.T + prefix string +} + +func (a *testLoggerAdapter) Write(d []byte) (int, error) { + if d[len(d)-1] == '\n' { + d = d[:len(d)-1] + } + if a.prefix != "" { + l := a.prefix + ": " + string(d) + a.t.Log(l) + return len(l), nil + } + + a.t.Log(string(d)) + return len(d), nil +} + +func newTestLogger(t *testing.T) *log.Logger { + return log.New(&testLoggerAdapter{t: t}, "", log.Lmicroseconds) +} + +func newTestLoggerWithPrefix(t *testing.T, prefix string) *log.Logger { + return log.New(&testLoggerAdapter{t: t, prefix: prefix}, "", log.Lmicroseconds) +} + +type cluster struct { + dirs []string + stores []*InmemStore + fsms []*MockFSM + snaps []*FileSnapshotStore + trans []LoopbackTransport + rafts []*Raft + t *testing.T + observationCh chan Observation + conf *Config + propagateTimeout time.Duration + longstopTimeout time.Duration + logger *log.Logger + startTime time.Time + + failedLock sync.Mutex + failedCh chan struct{} + failed bool +} + +func (c *cluster) Merge(other *cluster) { + c.dirs = append(c.dirs, other.dirs...) + c.stores = append(c.stores, other.stores...) + c.fsms = append(c.fsms, other.fsms...) + c.snaps = append(c.snaps, other.snaps...) + c.trans = append(c.trans, other.trans...) + c.rafts = append(c.rafts, other.rafts...) +} + +// notifyFailed will close the failed channel which can signal the goroutine +// running the test that another goroutine has detected a failure in order to +// terminate the test. +func (c *cluster) notifyFailed() { + c.failedLock.Lock() + defer c.failedLock.Unlock() + if !c.failed { + c.failed = true + close(c.failedCh) + } +} + +// Failf provides a logging function that fails the tests, prints the output +// with microseconds, and does not mysteriously eat the string. This can be +// safely called from goroutines but won't immediately halt the test. The +// failedCh will be closed to allow blocking functions in the main thread to +// detect the failure and react. 
Note that you should arrange for the main +// thread to block until all goroutines have completed in order to reliably +// fail tests using this function. +func (c *cluster) Failf(format string, args ...interface{}) { + c.logger.Printf(format, args...) + c.t.Fail() + c.notifyFailed() +} + +// FailNowf provides a logging function that fails the tests, prints the output +// with microseconds, and does not mysteriously eat the string. FailNowf must be +// called from the goroutine running the test or benchmark function, not from +// other goroutines created during the test. Calling FailNowf does not stop +// those other goroutines. +func (c *cluster) FailNowf(format string, args ...interface{}) { + c.logger.Printf(format, args...) + c.t.FailNow() +} + +// Close shuts down the cluster and cleans up. +func (c *cluster) Close() { + var futures []Future + for _, r := range c.rafts { + futures = append(futures, r.Shutdown()) + } + + // Wait for shutdown + limit := time.AfterFunc(c.longstopTimeout, func() { + // We can't FailNowf here, and c.Failf won't do anything if we + // hang, so panic. + panic("timed out waiting for shutdown") + }) + defer limit.Stop() + + for _, f := range futures { + if err := f.Error(); err != nil { + c.FailNowf("[ERR] shutdown future err: %v", err) + } + } + + for _, d := range c.dirs { + os.RemoveAll(d) + } +} + +// WaitEventChan returns a channel which will signal if an observation is made +// or a timeout occurs. It is possible to set a filter to look for specific +// observations. Setting timeout to 0 means that it will wait forever until a +// non-filtered observation is made. +func (c *cluster) WaitEventChan(filter FilterFn, timeout time.Duration) <-chan struct{} { + ch := make(chan struct{}) + go func() { + defer close(ch) + var timeoutCh <-chan time.Time + if timeout > 0 { + timeoutCh = time.After(timeout) + } + for { + select { + case <-timeoutCh: + return + + case o, ok := <-c.observationCh: + if !ok || filter == nil || filter(&o) { + return + } + } + } + }() + return ch +} + +// WaitEvent waits until an observation is made, a timeout occurs, or a test +// failure is signaled. It is possible to set a filter to look for specific +// observations. Setting timeout to 0 means that it will wait forever until a +// non-filtered observation is made or a test failure is signaled. +func (c *cluster) WaitEvent(filter FilterFn, timeout time.Duration) { + select { + case <-c.failedCh: + c.t.FailNow() + + case <-c.WaitEventChan(filter, timeout): + } +} + +// WaitForReplication blocks until every FSM in the cluster has the given +// length, or the long sanity check timeout expires. +func (c *cluster) WaitForReplication(fsmLength int) { + limitCh := time.After(c.longstopTimeout) + +CHECK: + for { + ch := c.WaitEventChan(nil, c.conf.CommitTimeout) + select { + case <-c.failedCh: + c.t.FailNow() + + case <-limitCh: + c.FailNowf("[ERR] Timeout waiting for replication") + + case <-ch: + for _, fsm := range c.fsms { + fsm.Lock() + num := len(fsm.logs) + fsm.Unlock() + if num != fsmLength { + continue CHECK + } + } + return + } + } +} + +// pollState takes a snapshot of the state of the cluster. This might not be +// stable, so use GetInState() to apply some additional checks when waiting +// for the cluster to achieve a particular state. 
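// Illustrative sketch of using the observer/WaitEventChan plumbing above in a
// test: block until a leadership-change observation arrives or the timeout
// expires (the returned channel closes in either case). The helper name is
// hypothetical.
func exampleWaitForLeaderObservation(c *cluster, timeout time.Duration) {
	leaderChanged := func(o *Observation) bool {
		_, ok := o.Data.(LeaderObservation)
		return ok
	}
	<-c.WaitEventChan(leaderChanged, timeout)
}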
+func (c *cluster) pollState(s RaftState) ([]*Raft, uint64) { + var highestTerm uint64 + in := make([]*Raft, 0, 1) + for _, r := range c.rafts { + if r.State() == s { + in = append(in, r) + } + term := r.getCurrentTerm() + if term > highestTerm { + highestTerm = term + } + } + return in, highestTerm +} + +// GetInState polls the state of the cluster and attempts to identify when it has +// settled into the given state. +func (c *cluster) GetInState(s RaftState) []*Raft { + c.logger.Printf("[INFO] Starting stability test for raft state: %+v", s) + limitCh := time.After(c.longstopTimeout) + + // An election should complete after 2 * max(HeartbeatTimeout, ElectionTimeout) + // because of the randomised timer expiring in 1 x interval ... 2 x interval. + // We add a bit for propagation delay. If the election fails (e.g. because + // two elections start at once), we will have got something through our + // observer channel indicating a different state (i.e. one of the nodes + // will have moved to candidate state) which will reset the timer. + // + // Because of an implementation peculiarity, it can actually be 3 x timeout. + timeout := c.conf.HeartbeatTimeout + if timeout < c.conf.ElectionTimeout { + timeout = c.conf.ElectionTimeout + } + timeout = 2*timeout + c.conf.CommitTimeout + timer := time.NewTimer(timeout) + defer timer.Stop() + + // Wait until we have a stable instate slice. Each time we see an + // observation a state has changed, recheck it and if it has changed, + // restart the timer. + var pollStartTime = time.Now() + for { + inState, highestTerm := c.pollState(s) + inStateTime := time.Now() + + // Sometimes this routine is called very early on before the + // rafts have started up. We then timeout even though no one has + // even started an election. So if the highest term in use is + // zero, we know there are no raft processes that have yet issued + // a RequestVote, and we set a long time out. This is fixed when + // we hear the first RequestVote, at which point we reset the + // timer. + if highestTerm == 0 { + timer.Reset(c.longstopTimeout) + } else { + timer.Reset(timeout) + } + + // Filter will wake up whenever we observe a RequestVote. + filter := func(ob *Observation) bool { + switch ob.Data.(type) { + case RaftState: + return true + case RequestVoteRequest: + return true + default: + return false + } + } + + select { + case <-c.failedCh: + c.t.FailNow() + + case <-limitCh: + c.FailNowf("[ERR] Timeout waiting for stable %s state", s) + + case <-c.WaitEventChan(filter, 0): + c.logger.Printf("[DEBUG] Resetting stability timeout") + + case t, ok := <-timer.C: + if !ok { + c.FailNowf("[ERR] Timer channel errored") + } + c.logger.Printf("[INFO] Stable state for %s reached at %s (%d nodes), %s from start of poll, %s from cluster start. Timeout at %s, %s after stability", + s, inStateTime, len(inState), inStateTime.Sub(pollStartTime), inStateTime.Sub(c.startTime), t, t.Sub(inStateTime)) + return inState + } + } +} + +// Leader waits for the cluster to elect a leader and stay in a stable state. +func (c *cluster) Leader() *Raft { + leaders := c.GetInState(Leader) + if len(leaders) != 1 { + c.FailNowf("[ERR] expected one leader: %v", leaders) + } + return leaders[0] +} + +// Followers waits for the cluster to have N-1 followers and stay in a stable +// state. 
+func (c *cluster) Followers() []*Raft { + expFollowers := len(c.rafts) - 1 + followers := c.GetInState(Follower) + if len(followers) != expFollowers { + c.FailNowf("[ERR] timeout waiting for %d followers (followers are %v)", expFollowers, followers) + } + return followers +} + +// FullyConnect connects all the transports together. +func (c *cluster) FullyConnect() { + c.logger.Printf("[DEBUG] Fully Connecting") + for i, t1 := range c.trans { + for j, t2 := range c.trans { + if i != j { + t1.Connect(t2.LocalAddr(), t2) + t2.Connect(t1.LocalAddr(), t1) + } + } + } +} + +// Disconnect disconnects all transports from the given address. +func (c *cluster) Disconnect(a string) { + c.logger.Printf("[DEBUG] Disconnecting %v", a) + for _, t := range c.trans { + if t.LocalAddr() == a { + t.DisconnectAll() + } else { + t.Disconnect(a) + } + } +} + +// IndexOf returns the index of the given raft instance. +func (c *cluster) IndexOf(r *Raft) int { + for i, n := range c.rafts { + if n == r { + return i + } + } + return -1 +} + +// EnsureLeader checks that ALL the nodes think the leader is the given expected +// leader. +func (c *cluster) EnsureLeader(t *testing.T, expect string) { + // We assume c.Leader() has been called already; now check all the rafts + // think the leader is correct + fail := false + for _, r := range c.rafts { + leader := r.Leader() + if leader != expect { + if leader == "" { + leader = "[none]" + } + if expect == "" { + c.logger.Printf("[ERR] Peer %s sees leader %v expected [none]", r, leader) + } else { + c.logger.Printf("[ERR] Peer %s sees leader %v expected %v", r, leader, expect) + } + fail = true + } + } + if fail { + c.FailNowf("[ERR] At least one peer has the wrong notion of leader") + } +} + +// EnsureSame makes sure all the FSMs have the same contents. +func (c *cluster) EnsureSame(t *testing.T) { + limit := time.Now().Add(c.longstopTimeout) + first := c.fsms[0] + +CHECK: + first.Lock() + for i, fsm := range c.fsms { + if i == 0 { + continue + } + fsm.Lock() + + if len(first.logs) != len(fsm.logs) { + fsm.Unlock() + if time.Now().After(limit) { + c.FailNowf("[ERR] FSM log length mismatch: %d %d", + len(first.logs), len(fsm.logs)) + } else { + goto WAIT + } + } + + for idx := 0; idx < len(first.logs); idx++ { + if bytes.Compare(first.logs[idx], fsm.logs[idx]) != 0 { + fsm.Unlock() + if time.Now().After(limit) { + c.FailNowf("[ERR] FSM log mismatch at index %d", idx) + } else { + goto WAIT + } + } + } + fsm.Unlock() + } + + first.Unlock() + return + +WAIT: + first.Unlock() + c.WaitEvent(nil, c.conf.CommitTimeout) + goto CHECK +} + +// raftToPeerSet returns the set of peers as a map. +func raftToPeerSet(r *Raft) map[string]struct{} { + peers := make(map[string]struct{}) + peers[r.localAddr] = struct{}{} + + raftPeers, _ := r.peerStore.Peers() + for _, p := range raftPeers { + peers[p] = struct{}{} + } + return peers +} + +// EnsureSamePeers makes sure all the rafts have the same set of peers. +func (c *cluster) EnsureSamePeers(t *testing.T) { + limit := time.Now().Add(c.longstopTimeout) + peerSet := raftToPeerSet(c.rafts[0]) + +CHECK: + for i, raft := range c.rafts { + if i == 0 { + continue + } + + otherSet := raftToPeerSet(raft) + if !reflect.DeepEqual(peerSet, otherSet) { + if time.Now().After(limit) { + c.FailNowf("[ERR] peer mismatch: %v %v", peerSet, otherSet) + } else { + goto WAIT + } + } + } + return + +WAIT: + c.WaitEvent(nil, c.conf.CommitTimeout) + goto CHECK +} + +// makeCluster will return a cluster with the given config and number of peers. 
+// If addPeers is true, they will be added into the peer store before starting, +// otherwise their transports will be wired up but they won't yet have configured +// each other. +func makeCluster(n int, addPeers bool, t *testing.T, conf *Config) *cluster { + if conf == nil { + conf = inmemConfig(t) + } + + c := &cluster{ + observationCh: make(chan Observation, 1024), + conf: conf, + // Propagation takes a maximum of 2 heartbeat timeouts (time to + // get a new heartbeat that would cause a commit) plus a bit. + propagateTimeout: conf.HeartbeatTimeout*2 + conf.CommitTimeout, + longstopTimeout: 5 * time.Second, + logger: newTestLoggerWithPrefix(t, "cluster"), + failedCh: make(chan struct{}), + } + c.t = t + peers := make([]string, 0, n) + + // Setup the stores and transports + for i := 0; i < n; i++ { + dir, err := ioutil.TempDir("", "raft") + if err != nil { + c.FailNowf("[ERR] err: %v ", err) + } + + store := NewInmemStore() + c.dirs = append(c.dirs, dir) + c.stores = append(c.stores, store) + c.fsms = append(c.fsms, &MockFSM{}) + + dir2, snap := FileSnapTest(t) + c.dirs = append(c.dirs, dir2) + c.snaps = append(c.snaps, snap) + + addr, trans := NewInmemTransport("") + c.trans = append(c.trans, trans) + peers = append(peers, addr) + } + + // Wire the transports together + c.FullyConnect() + + // Create all the rafts + c.startTime = time.Now() + for i := 0; i < n; i++ { + if n == 1 { + conf.EnableSingleNode = true + } + + logs := c.stores[i] + store := c.stores[i] + snap := c.snaps[i] + trans := c.trans[i] + + peerStore := &StaticPeers{} + if addPeers { + peerStore.StaticPeers = peers + } + peerConf := conf + peerConf.Logger = newTestLoggerWithPrefix(t, peers[i]) + + raft, err := NewRaft(peerConf, c.fsms[i], logs, store, snap, peerStore, trans) + if err != nil { + c.FailNowf("[ERR] NewRaft failed: %v", err) + } + + raft.RegisterObserver(NewObserver(c.observationCh, false, nil)) + if err != nil { + c.FailNowf("[ERR] RegisterObserver failed: %v", err) + } + c.rafts = append(c.rafts, raft) + } + + return c +} + +// See makeCluster. This adds the peers initially to the peer store. +func MakeCluster(n int, t *testing.T, conf *Config) *cluster { + return makeCluster(n, true, t, conf) +} + +// See makeCluster. This doesn't add the peers initially to the peer store. 
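MakeCluster above and MakeClusterNoPeers (continued below) are the only entry points the tests in this file use, and most tests follow the same shape: build a cluster, wait for it to stabilize, apply a command through the leader, and assert on the result. A minimal sketch of that pattern, using only helpers exercised elsewhere in this file (the test name itself is illustrative):

```go
func TestRaft_HarnessSketch(t *testing.T) {
	// Three nodes, default in-memory config, peers pre-populated.
	c := MakeCluster(3, t, nil)
	defer c.Close()

	// Block until the cluster settles with one leader and two followers.
	c.Followers()
	leader := c.Leader()

	// Apply a command through the leader and wait for it to commit.
	future := leader.Apply([]byte("hello"), c.conf.CommitTimeout)
	if err := future.Error(); err != nil {
		t.Fatalf("apply failed: %v", err)
	}

	// Wait until every FSM in the cluster has applied the entry.
	c.WaitForReplication(1)
}
```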
+func MakeClusterNoPeers(n int, t *testing.T, conf *Config) *cluster { + return makeCluster(n, false, t, conf) +} + +func TestRaft_StartStop(t *testing.T) { + c := MakeCluster(1, t, nil) + c.Close() +} + +func TestRaft_AfterShutdown(t *testing.T) { + c := MakeCluster(1, t, nil) + c.Close() + raft := c.rafts[0] + + // Everything should fail now + if f := raft.Apply(nil, 0); f.Error() != ErrRaftShutdown { + c.FailNowf("[ERR] should be shutdown: %v", f.Error()) + } + if f := raft.AddPeer(NewInmemAddr()); f.Error() != ErrRaftShutdown { + c.FailNowf("[ERR] should be shutdown: %v", f.Error()) + } + if f := raft.RemovePeer(NewInmemAddr()); f.Error() != ErrRaftShutdown { + c.FailNowf("[ERR] should be shutdown: %v", f.Error()) + } + if f := raft.Snapshot(); f.Error() != ErrRaftShutdown { + c.FailNowf("[ERR] should be shutdown: %v", f.Error()) + } + + // Should be idempotent + if f := raft.Shutdown(); f.Error() != nil { + c.FailNowf("[ERR] shutdown should be idempotent") + } + +} + +func TestRaft_SingleNode(t *testing.T) { + conf := inmemConfig(t) + c := MakeCluster(1, t, conf) + defer c.Close() + raft := c.rafts[0] + + // Watch leaderCh for change + select { + case v := <-raft.LeaderCh(): + if !v { + c.FailNowf("[ERR] should become leader") + } + case <-time.After(conf.HeartbeatTimeout * 3): + c.FailNowf("[ERR] timeout becoming leader") + } + + // Should be leader + if s := raft.State(); s != Leader { + c.FailNowf("[ERR] expected leader: %v", s) + } + + // Should be able to apply + future := raft.Apply([]byte("test"), c.conf.HeartbeatTimeout) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Check the response + if future.Response().(int) != 1 { + c.FailNowf("[ERR] bad response: %v", future.Response()) + } + + // Check the index + if idx := future.Index(); idx == 0 { + c.FailNowf("[ERR] bad index: %d", idx) + } + + // Check that it is applied to the FSM + if len(c.fsms[0].logs) != 1 { + c.FailNowf("[ERR] did not apply to FSM!") + } +} + +func TestRaft_TripleNode(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Should be one leader + c.Followers() + leader := c.Leader() + c.EnsureLeader(t, leader.localAddr) + + // Should be able to apply + future := leader.Apply([]byte("test"), c.conf.CommitTimeout) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + c.WaitForReplication(1) +} + +func TestRaft_LeaderFail(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Should be one leader + c.Followers() + leader := c.Leader() + + // Should be able to apply + future := leader.Apply([]byte("test"), c.conf.CommitTimeout) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + c.WaitForReplication(1) + + // Disconnect the leader now + t.Logf("[INFO] Disconnecting %v", leader) + leaderTerm := leader.getCurrentTerm() + c.Disconnect(leader.localAddr) + + // Wait for new leader + limit := time.Now().Add(c.longstopTimeout) + var newLead *Raft + for time.Now().Before(limit) && newLead == nil { + c.WaitEvent(nil, c.conf.CommitTimeout) + leaders := c.GetInState(Leader) + if len(leaders) == 1 && leaders[0] != leader { + newLead = leaders[0] + } + } + if newLead == nil { + c.FailNowf("[ERR] expected new leader") + } + + // Ensure the term is greater + if newLead.getCurrentTerm() <= leaderTerm { + c.FailNowf("[ERR] expected newer term! 
%d %d (%v, %v)", newLead.getCurrentTerm(), leaderTerm, newLead, leader) + } + + // Apply should work not work on old leader + future1 := leader.Apply([]byte("fail"), c.conf.CommitTimeout) + + // Apply should work on newer leader + future2 := newLead.Apply([]byte("apply"), c.conf.CommitTimeout) + + // Future2 should work + if err := future2.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Reconnect the networks + t.Logf("[INFO] Reconnecting %v", leader) + c.FullyConnect() + + // Future1 should fail + if err := future1.Error(); err != ErrLeadershipLost && err != ErrNotLeader { + c.FailNowf("[ERR] err: %v", err) + } + + // Wait for log replication + c.EnsureSame(t) + + // Check two entries are applied to the FSM + for _, fsm := range c.fsms { + fsm.Lock() + if len(fsm.logs) != 2 { + c.FailNowf("[ERR] did not apply both to FSM! %v", fsm.logs) + } + if bytes.Compare(fsm.logs[0], []byte("test")) != 0 { + c.FailNowf("[ERR] first entry should be 'test'") + } + if bytes.Compare(fsm.logs[1], []byte("apply")) != 0 { + c.FailNowf("[ERR] second entry should be 'apply'") + } + fsm.Unlock() + } +} + +func TestRaft_BehindFollower(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Disconnect one follower + leader := c.Leader() + followers := c.Followers() + behind := followers[0] + c.Disconnect(behind.localAddr) + + // Commit a lot of things + var future Future + for i := 0; i < 100; i++ { + future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) + } + + // Wait for the last future to apply + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } else { + t.Logf("[INFO] Finished apply without behind follower") + } + + // Check that we have a non zero last contact + if behind.LastContact().IsZero() { + c.FailNowf("[ERR] expected previous contact") + } + + // Reconnect the behind node + c.FullyConnect() + + // Ensure all the logs are the same + c.EnsureSame(t) + + // Ensure one leader + leader = c.Leader() + c.EnsureLeader(t, leader.localAddr) +} + +func TestRaft_ApplyNonLeader(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Wait for a leader + c.Leader() + + // Try to apply to them + followers := c.GetInState(Follower) + if len(followers) != 2 { + c.FailNowf("[ERR] Expected 2 followers") + } + follower := followers[0] + + // Try to apply + future := follower.Apply([]byte("test"), c.conf.CommitTimeout) + if future.Error() != ErrNotLeader { + c.FailNowf("[ERR] should not apply on follower") + } + + // Should be cached + if future.Error() != ErrNotLeader { + c.FailNowf("[ERR] should not apply on follower") + } +} + +func TestRaft_ApplyConcurrent(t *testing.T) { + // Make the cluster + conf := inmemConfig(t) + conf.HeartbeatTimeout = 2 * conf.HeartbeatTimeout + conf.ElectionTimeout = 2 * conf.ElectionTimeout + c := MakeCluster(3, t, conf) + defer c.Close() + + // Wait for a leader + leader := c.Leader() + + // Create a wait group + const sz = 100 + var group sync.WaitGroup + group.Add(sz) + + applyF := func(i int) { + defer group.Done() + future := leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) + if err := future.Error(); err != nil { + c.Failf("[ERR] err: %v", err) + } + } + + // Concurrently apply + for i := 0; i < sz; i++ { + go applyF(i) + } + + // Wait to finish + doneCh := make(chan struct{}) + go func() { + group.Wait() + close(doneCh) + }() + select { + case <-doneCh: + case <-time.After(c.longstopTimeout): + c.FailNowf("[ERR] timeout") + } + + // If anything failed up 
to this point then bail now, rather than do a + // confusing compare. + if t.Failed() { + c.FailNowf("[ERR] One or more of the apply operations failed") + } + + // Check the FSMs + c.EnsureSame(t) +} + +func TestRaft_ApplyConcurrent_Timeout(t *testing.T) { + // Make the cluster + conf := inmemConfig(t) + conf.CommitTimeout = 1 * time.Millisecond + conf.HeartbeatTimeout = 2 * conf.HeartbeatTimeout + conf.ElectionTimeout = 2 * conf.ElectionTimeout + c := MakeCluster(1, t, conf) + defer c.Close() + + // Wait for a leader + leader := c.Leader() + + // Enough enqueues should cause at least one timeout... + var didTimeout int32 + for i := 0; (i < 5000) && (atomic.LoadInt32(&didTimeout) == 0); i++ { + go func(i int) { + future := leader.Apply([]byte(fmt.Sprintf("test%d", i)), time.Microsecond) + if future.Error() == ErrEnqueueTimeout { + atomic.StoreInt32(&didTimeout, 1) + } + }(i) + + // Give the leader loop some other things to do in order to + // increase the odds of a timeout. + if i%5 == 0 { + leader.VerifyLeader() + } + } + + // Loop until we see a timeout, or give up. + limit := time.Now().Add(c.longstopTimeout) + for time.Now().Before(limit) { + if atomic.LoadInt32(&didTimeout) != 0 { + return + } + c.WaitEvent(nil, c.propagateTimeout) + } + c.FailNowf("[ERR] Timeout waiting to detect apply timeouts") +} + +func TestRaft_JoinNode(t *testing.T) { + // Make a cluster + c := MakeCluster(2, t, nil) + defer c.Close() + + // Apply a log to this cluster to ensure it is 'newer' + var future Future + leader := c.Leader() + future = leader.Apply([]byte("first"), 0) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } else { + t.Logf("[INFO] Applied log") + } + + // Make a new cluster of 1 + c1 := MakeCluster(1, t, nil) + + // Merge clusters + c.Merge(c1) + c.FullyConnect() + + // Wait until we have 2 leaders + limit := time.Now().Add(c.longstopTimeout) + var leaders []*Raft + for time.Now().Before(limit) && len(leaders) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + leaders = c.GetInState(Leader) + } + if len(leaders) != 2 { + c.FailNowf("[ERR] expected two leader: %v", leaders) + } + + // Join the new node in + future = leader.AddPeer(c1.rafts[0].localAddr) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Wait until we have 2 followers + limit = time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + c.FailNowf("[ERR] expected two followers: %v", followers) + } + + // Check the FSMs + c.EnsureSame(t) + + // Check the peers + c.EnsureSamePeers(t) + + // Ensure one leader + leader = c.Leader() + c.EnsureLeader(t, leader.localAddr) +} + +func TestRaft_RemoveFollower(t *testing.T) { + // Make a cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have 2 followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + c.FailNowf("[ERR] expected two followers: %v", followers) + } + + // Remove a follower + follower := followers[0] + future := leader.RemovePeer(follower.localAddr) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Other 
nodes should have fewer peers + if peers, _ := leader.peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers") + } + if peers, _ := followers[1].peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers") + } +} + +func TestRaft_RemoveLeader(t *testing.T) { + // Make a cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have 2 followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + c.FailNowf("[ERR] expected two followers: %v", followers) + } + + // Remove the leader + leader.RemovePeer(leader.localAddr) + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Should have a new leader + newLeader := c.Leader() + + // Wait a bit for log application + time.Sleep(c.propagateTimeout) + + // Other nodes should have fewer peers + if peers, _ := newLeader.peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers") + } + + // Old leader should be shutdown + if leader.State() != Shutdown { + c.FailNowf("[ERR] leader should be shutdown") + } + + // Old leader should have no peers + if peers, _ := leader.peerStore.Peers(); len(peers) != 1 { + c.FailNowf("[ERR] leader should have no peers") + } +} + +func TestRaft_RemoveLeader_NoShutdown(t *testing.T) { + // Make a cluster + conf := inmemConfig(t) + conf.ShutdownOnRemove = false + c := MakeCluster(3, t, conf) + defer c.Close() + + // Get the leader + c.Followers() + leader := c.Leader() + + // Remove the leader + var removeFuture Future + for i := byte(0); i < 100; i++ { + future := leader.Apply([]byte{i}, 0) + if i == 80 { + removeFuture = leader.RemovePeer(leader.localAddr) + } + if i > 80 { + if err := future.Error(); err == nil || err != ErrNotLeader { + c.FailNowf("[ERR] err: %v, future entries should fail", err) + } + } + } + + if err := removeFuture.Error(); err != nil { + c.FailNowf("[ERR] RemovePeer failed with error %v", err) + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Should have a new leader + newLeader := c.Leader() + + // Wait a bit for log application + time.Sleep(c.propagateTimeout) + + // Other nodes should have fewer peers + if peers, _ := newLeader.peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers") + } + + // Old leader should be a follower + if leader.State() != Follower { + c.FailNowf("[ERR] leader should be shutdown") + } + + // Old leader should have no peers + if peers, _ := leader.peerStore.Peers(); len(peers) != 1 { + c.FailNowf("[ERR] leader should have no peers") + } + + // Other nodes should have the same state + c.EnsureSame(t) +} + +func TestRaft_RemoveLeader_SplitCluster(t *testing.T) { + // Enable operation after a remove + conf := inmemConfig(t) + conf.EnableSingleNode = true + conf.ShutdownOnRemove = false + conf.DisableBootstrapAfterElect = false + + // Make a cluster + c := MakeCluster(3, t, conf) + defer c.Close() + + // Get the leader + c.Followers() + leader := c.Leader() + + // Remove the leader + leader.RemovePeer(leader.localAddr) + + // Wait until we have 2 leaders + limit := time.Now().Add(c.longstopTimeout) + var leaders []*Raft + for time.Now().Before(limit) && len(leaders) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + leaders = c.GetInState(Leader) + } + if len(leaders) != 2 { + c.FailNowf("[ERR] expected two leader: %v", leaders) + } + + // Old leader 
should have no peers + if len(leader.peers) != 0 { + c.FailNowf("[ERR] leader should have no peers") + } +} + +func TestRaft_AddKnownPeer(t *testing.T) { + // Make a cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + followers := c.GetInState(Follower) + + // Add a follower + future := leader.AddPeer(followers[0].localAddr) + + // Should be already added + if err := future.Error(); err != ErrKnownPeer { + c.FailNowf("[ERR] err: %v", err) + } +} + +func TestRaft_RemoveUnknownPeer(t *testing.T) { + // Make a cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Remove unknown + future := leader.RemovePeer(NewInmemAddr()) + + // Should be already added + if err := future.Error(); err != ErrUnknownPeer { + c.FailNowf("[ERR] err: %v", err) + } +} + +func TestRaft_SnapshotRestore(t *testing.T) { + // Make the cluster + conf := inmemConfig(t) + conf.TrailingLogs = 10 + c := MakeCluster(1, t, conf) + defer c.Close() + + // Commit a lot of things + leader := c.Leader() + var future Future + for i := 0; i < 100; i++ { + future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) + } + + // Wait for the last future to apply + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Take a snapshot + snapFuture := leader.Snapshot() + if err := snapFuture.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Check for snapshot + if snaps, _ := leader.snapshots.List(); len(snaps) != 1 { + c.FailNowf("[ERR] should have a snapshot") + } + + // Logs should be trimmed + if idx, _ := leader.logs.FirstIndex(); idx != 92 { + c.FailNowf("[ERR] should trim logs to 92: %d", idx) + } + + // Shutdown + shutdown := leader.Shutdown() + if err := shutdown.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Restart the Raft + r := leader + // Can't just reuse the old transport as it will be closed + _, trans2 := NewInmemTransport(r.trans.LocalAddr()) + r, err := NewRaft(r.conf, r.fsm, r.logs, r.stable, + r.snapshots, r.peerStore, trans2) + if err != nil { + c.FailNowf("[ERR] err: %v", err) + } + c.rafts[0] = r + + // We should have restored from the snapshot! 
+ if last := r.getLastApplied(); last != 101 { + c.FailNowf("[ERR] bad last: %v", last) + } +} + +func TestRaft_SnapshotRestore_PeerChange(t *testing.T) { + // Make the cluster + conf := inmemConfig(t) + conf.TrailingLogs = 10 + c := MakeCluster(3, t, conf) + defer c.Close() + + // Commit a lot of things + leader := c.Leader() + var future Future + for i := 0; i < 100; i++ { + future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) + } + + // Wait for the last future to apply + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Take a snapshot + snapFuture := leader.Snapshot() + if err := snapFuture.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Shutdown + shutdown := leader.Shutdown() + if err := shutdown.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Make a separate cluster + c2 := MakeClusterNoPeers(2, t, conf) + defer c2.Close() + + // Kill the old cluster + for _, sec := range c.rafts { + if sec != leader { + sec.Shutdown() + } + } + + // Change the peer addresses + peers := []string{leader.trans.LocalAddr()} + for _, sec := range c2.rafts { + peers = append(peers, sec.trans.LocalAddr()) + } + + // Restart the Raft with new peers + r := leader + peerStore := &StaticPeers{StaticPeers: peers} + // Can't just reuse the old transport as it will be closed + _, trans2 := NewInmemTransport(r.trans.LocalAddr()) + r, err := NewRaft(r.conf, r.fsm, r.logs, r.stable, + r.snapshots, peerStore, trans2) + if err != nil { + c.FailNowf("[ERR] err: %v", err) + } + c.rafts[0] = r + c2.rafts = append(c2.rafts, r) + c2.trans = append(c2.trans, r.trans.(*InmemTransport)) + c2.fsms = append(c2.fsms, r.fsm.(*MockFSM)) + c2.FullyConnect() + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Ensure we elect a leader, and that we replicate + // to our new followers + c2.EnsureSame(t) + + // We should have restored from the snapshot! 
+	if last := r.getLastApplied(); last != 102 {
+		c.FailNowf("[ERR] bad last: %v", last)
+	}
+}
+
+func TestRaft_AutoSnapshot(t *testing.T) {
+	// Make the cluster
+	conf := inmemConfig(t)
+	conf.SnapshotInterval = conf.CommitTimeout * 2
+	conf.SnapshotThreshold = 50
+	conf.TrailingLogs = 10
+	c := MakeCluster(1, t, conf)
+	defer c.Close()
+
+	// Commit a lot of things
+	leader := c.Leader()
+	var future Future
+	for i := 0; i < 100; i++ {
+		future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
+	}
+
+	// Wait for the last future to apply
+	if err := future.Error(); err != nil {
+		c.FailNowf("[ERR] err: %v", err)
+	}
+
+	// Wait for a snapshot to happen
+	time.Sleep(c.propagateTimeout)
+
+	// Check for snapshot
+	if snaps, _ := leader.snapshots.List(); len(snaps) == 0 {
+		c.FailNowf("[ERR] should have a snapshot")
+	}
+}
+
+func TestRaft_ManualSnapshot(t *testing.T) {
+	// Make the cluster
+	conf := inmemConfig(t)
+	conf.SnapshotThreshold = 50
+	conf.TrailingLogs = 10
+	c := MakeCluster(1, t, conf)
+	defer c.Close()
+
+	leader := c.Leader()
+	// with nothing committed, asking for a snapshot should return an error
+	ssErr := leader.Snapshot().Error()
+	if ssErr != ErrNothingNewToSnapshot {
+		t.Errorf("Attempt to manually create snapshot should have errored because there's nothing to do: %v", ssErr)
+	}
+	// commit some things
+	var future Future
+	for i := 0; i < 10; i++ {
+		future = leader.Apply([]byte(fmt.Sprintf("test %d", i)), 0)
+	}
+	if err := future.Error(); err != nil {
+		c.FailNowf("[ERR] Error Apply new log entries: %v", err)
+	}
+	// now we should be able to ask for a snapshot without getting an error
+	ssErr = leader.Snapshot().Error()
+	if ssErr != nil {
+		t.Errorf("Request for Snapshot failed: %v", ssErr)
+	}
+}
+
+func TestRaft_SendSnapshotFollower(t *testing.T) {
+	// Make the cluster
+	conf := inmemConfig(t)
+	conf.TrailingLogs = 10
+	c := MakeCluster(3, t, conf)
+	defer c.Close()
+
+	// Disconnect one follower
+	followers := c.Followers()
+	leader := c.Leader()
+	behind := followers[0]
+	c.Disconnect(behind.localAddr)
+
+	// Commit a lot of things
+	var future Future
+	for i := 0; i < 100; i++ {
+		future = leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0)
+	}
+
+	// Wait for the last future to apply
+	if err := future.Error(); err != nil {
+		c.FailNowf("[ERR] err: %v", err)
+	} else {
+		t.Logf("[INFO] Finished apply without behind follower")
+	}
+
+	// Snapshot, this will truncate logs!
+ for _, r := range c.rafts { + future = r.Snapshot() + // the disconnected node will have nothing to snapshot, so that's expected + if err := future.Error(); err != nil && err != ErrNothingNewToSnapshot { + c.FailNowf("[ERR] err: %v", err) + } + } + + // Reconnect the behind node + c.FullyConnect() + + // Ensure all the logs are the same + c.EnsureSame(t) +} + +func TestRaft_ReJoinFollower(t *testing.T) { + // Enable operation after a remove + conf := inmemConfig(t) + conf.ShutdownOnRemove = false + + // Make a cluster + c := MakeCluster(3, t, conf) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have 2 followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + c.FailNowf("[ERR] expected two followers: %v", followers) + } + + // Remove a follower + follower := followers[0] + future := leader.RemovePeer(follower.localAddr) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Other nodes should have fewer peers + if peers, _ := leader.peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers: %v", peers) + } + if peers, _ := followers[1].peerStore.Peers(); len(peers) != 2 { + c.FailNowf("[ERR] too many peers: %v", peers) + } + + // Get the leader + time.Sleep(c.propagateTimeout) + leader = c.Leader() + + // Rejoin. The follower will have a higher term than the leader, + // this will cause the leader to step down, and a new round of elections + // to take place. We should eventually re-stabilize. + future = leader.AddPeer(follower.localAddr) + if err := future.Error(); err != nil && err != ErrLeadershipLost { + c.FailNowf("[ERR] err: %v", err) + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Other nodes should have fewer peers + if peers, _ := leader.peerStore.Peers(); len(peers) != 3 { + c.FailNowf("[ERR] missing peers: %v", peers) + } + if peers, _ := followers[1].peerStore.Peers(); len(peers) != 3 { + c.FailNowf("[ERR] missing peers: %v", peers) + } + + // Should be a follower now + if follower.State() != Follower { + c.FailNowf("[ERR] bad state: %v", follower.State()) + } +} + +func TestRaft_LeaderLeaseExpire(t *testing.T) { + // Make a cluster + conf := inmemConfig(t) + c := MakeCluster(2, t, conf) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have a followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 1 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 1 { + c.FailNowf("[ERR] expected a followers: %v", followers) + } + + // Disconnect the follower now + follower := followers[0] + t.Logf("[INFO] Disconnecting %v", follower) + c.Disconnect(follower.localAddr) + + // Watch the leaderCh + select { + case v := <-leader.LeaderCh(): + if v { + c.FailNowf("[ERR] should step down as leader") + } + case <-time.After(conf.LeaderLeaseTimeout * 2): + c.FailNowf("[ERR] timeout stepping down as leader") + } + + // Ensure the last contact of the leader is non-zero + if leader.LastContact().IsZero() { + c.FailNowf("[ERR] expected non-zero contact time") + } + + // Should be no leaders + if len(c.GetInState(Leader)) != 0 { + c.FailNowf("[ERR] expected step down") + } + + // Verify no further contact + 
last := follower.LastContact() + time.Sleep(c.propagateTimeout) + + // Check that last contact has not changed + if last != follower.LastContact() { + c.FailNowf("[ERR] unexpected further contact") + } + + // Ensure both have cleared their leader + if l := leader.Leader(); l != "" { + c.FailNowf("[ERR] bad: %v", l) + } + if l := follower.Leader(); l != "" { + c.FailNowf("[ERR] bad: %v", l) + } +} + +func TestRaft_Barrier(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Commit a lot of things + for i := 0; i < 100; i++ { + leader.Apply([]byte(fmt.Sprintf("test%d", i)), 0) + } + + // Wait for a barrier complete + barrier := leader.Barrier(0) + + // Wait for the barrier future to apply + if err := barrier.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Ensure all the logs are the same + c.EnsureSame(t) + if len(c.fsms[0].logs) != 100 { + c.FailNowf("[ERR] Bad log length") + } +} + +func TestRaft_VerifyLeader(t *testing.T) { + // Make the cluster + c := MakeCluster(3, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Verify we are leader + verify := leader.VerifyLeader() + + // Wait for the verify to apply + if err := verify.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } +} + +func TestRaft_VerifyLeader_Single(t *testing.T) { + // Make the cluster + c := MakeCluster(1, t, nil) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Verify we are leader + verify := leader.VerifyLeader() + + // Wait for the verify to apply + if err := verify.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } +} + +func TestRaft_VerifyLeader_Fail(t *testing.T) { + // Make a cluster + conf := inmemConfig(t) + c := MakeCluster(2, t, conf) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have a followers + followers := c.Followers() + + // Force follower to different term + follower := followers[0] + follower.setCurrentTerm(follower.getCurrentTerm() + 1) + + // Verify we are leader + verify := leader.VerifyLeader() + + // Wait for the leader to step down + if err := verify.Error(); err != ErrNotLeader && err != ErrLeadershipLost { + c.FailNowf("[ERR] err: %v", err) + } + + // Ensure the known leader is cleared + if l := leader.Leader(); l != "" { + c.FailNowf("[ERR] bad: %v", l) + } +} + +func TestRaft_VerifyLeader_ParitalConnect(t *testing.T) { + // Make a cluster + conf := inmemConfig(t) + c := MakeCluster(3, t, conf) + defer c.Close() + + // Get the leader + leader := c.Leader() + + // Wait until we have a followers + limit := time.Now().Add(c.longstopTimeout) + var followers []*Raft + for time.Now().Before(limit) && len(followers) != 2 { + c.WaitEvent(nil, c.conf.CommitTimeout) + followers = c.GetInState(Follower) + } + if len(followers) != 2 { + c.FailNowf("[ERR] expected two followers but got: %v", followers) + } + + // Force partial disconnect + follower := followers[0] + t.Logf("[INFO] Disconnecting %v", follower) + c.Disconnect(follower.localAddr) + + // Verify we are leader + verify := leader.VerifyLeader() + + // Wait for the leader to step down + if err := verify.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } +} + +func TestRaft_SettingPeers(t *testing.T) { + // Make the cluster + c := MakeClusterNoPeers(3, t, nil) + defer c.Close() + + peers := make([]string, 0, len(c.rafts)) + for _, v := range c.rafts { + peers = append(peers, v.localAddr) + } + + for _, v := range c.rafts { + 
future := v.SetPeers(peers) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] error setting peers: %v", err) + } + } + + // Wait a while + time.Sleep(c.propagateTimeout) + + // Should have a new leader + if leader := c.Leader(); leader == nil { + c.FailNowf("[ERR] no leader?") + } +} + +func TestRaft_StartAsLeader(t *testing.T) { + conf := inmemConfig(t) + conf.StartAsLeader = true + c := MakeCluster(1, t, conf) + defer c.Close() + raft := c.rafts[0] + + // Watch leaderCh for change + select { + case v := <-raft.LeaderCh(): + if !v { + c.FailNowf("[ERR] should become leader") + } + case <-time.After(c.conf.HeartbeatTimeout * 4): + // Longer than you think as possibility of multiple elections + c.FailNowf("[ERR] timeout becoming leader") + } + + // Should be leader + if s := raft.State(); s != Leader { + c.FailNowf("[ERR] expected leader: %v", s) + } + + // Should be able to apply + future := raft.Apply([]byte("test"), c.conf.CommitTimeout) + if err := future.Error(); err != nil { + c.FailNowf("[ERR] err: %v", err) + } + + // Check the response + if future.Response().(int) != 1 { + c.FailNowf("[ERR] bad response: %v", future.Response()) + } + + // Check the index + if idx := future.Index(); idx == 0 { + c.FailNowf("[ERR] bad index: %d", idx) + } + + // Check that it is applied to the FSM + if len(c.fsms[0].logs) != 1 { + c.FailNowf("[ERR] did not apply to FSM!") + } +} + +func TestRaft_NotifyCh(t *testing.T) { + ch := make(chan bool, 1) + conf := inmemConfig(t) + conf.NotifyCh = ch + c := MakeCluster(1, t, conf) + defer c.Close() + + // Watch leaderCh for change + select { + case v := <-ch: + if !v { + c.FailNowf("[ERR] should become leader") + } + case <-time.After(conf.HeartbeatTimeout * 8): + c.FailNowf("[ERR] timeout becoming leader") + } + + // Close the cluster + c.Close() + + // Watch leaderCh for change + select { + case v := <-ch: + if v { + c.FailNowf("[ERR] should step down as leader") + } + case <-time.After(conf.HeartbeatTimeout * 6): + c.FailNowf("[ERR] timeout on step down as leader") + } +} + +func TestRaft_Voting(t *testing.T) { + c := MakeCluster(3, t, nil) + defer c.Close() + followers := c.Followers() + ldr := c.Leader() + ldrT := c.trans[c.IndexOf(ldr)] + + reqVote := RequestVoteRequest{ + Term: 42, + Candidate: ldrT.EncodePeer(ldr.localAddr), + LastLogIndex: ldr.LastIndex(), + LastLogTerm: 1, + } + // a follower that thinks there's a leader should vote for that leader. 
+ var resp RequestVoteResponse + if err := ldrT.RequestVote(followers[0].localAddr, &reqVote, &resp); err != nil { + c.FailNowf("[ERR] RequestVote RPC failed %v", err) + } + if !resp.Granted { + c.FailNowf("[ERR] expected vote to be granted, but wasn't %+v", resp) + } + // a follow that thinks there's a leader shouldn't vote for a different candidate + reqVote.Candidate = ldrT.EncodePeer(followers[0].localAddr) + if err := ldrT.RequestVote(followers[1].localAddr, &reqVote, &resp); err != nil { + c.FailNowf("[ERR] RequestVote RPC failed %v", err) + } + if resp.Granted { + c.FailNowf("[ERR] expected vote not to be granted, but was %+v", resp) + } +} diff --git a/go/vt/orchestrator/external/raft/replication.go b/go/vt/orchestrator/external/raft/replication.go new file mode 100644 index 0000000000..1f8b923cd8 --- /dev/null +++ b/go/vt/orchestrator/external/raft/replication.go @@ -0,0 +1,522 @@ +package raft + +import ( + "errors" + "fmt" + "sync" + "time" + + "github.com/armon/go-metrics" +) + +const ( + maxFailureScale = 12 + failureWait = 10 * time.Millisecond +) + +var ( + // ErrLogNotFound indicates a given log entry is not available. + ErrLogNotFound = errors.New("log not found") + + // ErrPipelineReplicationNotSupported can be returned by the transport to + // signal that pipeline replication is not supported in general, and that + // no error message should be produced. + ErrPipelineReplicationNotSupported = errors.New("pipeline replication not supported") +) + +type followerReplication struct { + peer string + inflight *inflight + + stopCh chan uint64 + triggerCh chan struct{} + + currentTerm uint64 + matchIndex uint64 + nextIndex uint64 + + lastContact time.Time + lastContactLock sync.RWMutex + + failures uint64 + + notifyCh chan struct{} + notify []*verifyFuture + notifyLock sync.Mutex + + // stepDown is used to indicate to the leader that we + // should step down based on information from a follower. + stepDown chan struct{} + + // allowPipeline is used to control it seems like + // pipeline replication should be enabled. + allowPipeline bool +} + +// notifyAll is used to notify all the waiting verify futures +// if the follower believes we are still the leader. +func (s *followerReplication) notifyAll(leader bool) { + // Clear the waiting notifies minimizing lock time + s.notifyLock.Lock() + n := s.notify + s.notify = nil + s.notifyLock.Unlock() + + // Submit our votes + for _, v := range n { + v.vote(leader) + } +} + +// LastContact returns the time of last contact. +func (s *followerReplication) LastContact() time.Time { + s.lastContactLock.RLock() + last := s.lastContact + s.lastContactLock.RUnlock() + return last +} + +// setLastContact sets the last contact to the current time. +func (s *followerReplication) setLastContact() { + s.lastContactLock.Lock() + s.lastContact = time.Now() + s.lastContactLock.Unlock() +} + +// replicate is a long running routine that is used to manage +// the process of replicating logs to our followers. 
+func (r *Raft) replicate(s *followerReplication) { + // Start an async heartbeating routing + stopHeartbeat := make(chan struct{}) + defer close(stopHeartbeat) + r.goFunc(func() { r.heartbeat(s, stopHeartbeat) }) + +RPC: + shouldStop := false + for !shouldStop { + select { + case maxIndex := <-s.stopCh: + // Make a best effort to replicate up to this index + if maxIndex > 0 { + r.replicateTo(s, maxIndex) + } + return + case <-s.triggerCh: + lastLogIdx, _ := r.getLastLog() + shouldStop = r.replicateTo(s, lastLogIdx) + case <-randomTimeout(r.conf.CommitTimeout): + lastLogIdx, _ := r.getLastLog() + shouldStop = r.replicateTo(s, lastLogIdx) + } + + // If things looks healthy, switch to pipeline mode + if !shouldStop && s.allowPipeline { + goto PIPELINE + } + } + return + +PIPELINE: + // Disable until re-enabled + s.allowPipeline = false + + // Replicates using a pipeline for high performance. This method + // is not able to gracefully recover from errors, and so we fall back + // to standard mode on failure. + if err := r.pipelineReplicate(s); err != nil { + if err != ErrPipelineReplicationNotSupported { + r.logger.Printf("[ERR] raft: Failed to start pipeline replication to %s: %s", s.peer, err) + } + } + goto RPC +} + +// replicateTo is used to replicate the logs up to a given last index. +// If the follower log is behind, we take care to bring them up to date. +func (r *Raft) replicateTo(s *followerReplication, lastIndex uint64) (shouldStop bool) { + // Create the base request + var req AppendEntriesRequest + var resp AppendEntriesResponse + var start time.Time +START: + // Prevent an excessive retry rate on errors + if s.failures > 0 { + select { + case <-time.After(backoff(failureWait, s.failures, maxFailureScale)): + case <-r.shutdownCh: + } + } + + // Setup the request + if err := r.setupAppendEntries(s, &req, s.nextIndex, lastIndex); err == ErrLogNotFound { + goto SEND_SNAP + } else if err != nil { + return + } + + // Make the RPC call + start = time.Now() + if err := r.trans.AppendEntries(s.peer, &req, &resp); err != nil { + r.logger.Printf("[ERR] raft: Failed to AppendEntries to %v: %v", s.peer, err) + s.failures++ + return + } + appendStats(s.peer, start, float32(len(req.Entries))) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return true + } + + // Update the last contact + s.setLastContact() + + // Update s based on success + if resp.Success { + // Update our replication state + updateLastAppended(s, &req) + + // Clear any failures, allow pipelining + s.failures = 0 + s.allowPipeline = true + } else { + s.nextIndex = max(min(s.nextIndex-1, resp.LastLog+1), 1) + s.matchIndex = s.nextIndex - 1 + if resp.NoRetryBackoff { + s.failures = 0 + } else { + s.failures++ + } + r.logger.Printf("[WARN] raft: AppendEntries to %v rejected, sending older logs (next: %d)", s.peer, s.nextIndex) + } + +CHECK_MORE: + // Check if there are more logs to replicate + if s.nextIndex <= lastIndex { + goto START + } + return + + // SEND_SNAP is used when we fail to get a log, usually because the follower + // is too far behind, and we must ship a snapshot down instead +SEND_SNAP: + if stop, err := r.sendLatestSnapshot(s); stop { + return true + } else if err != nil { + r.logger.Printf("[ERR] raft: Failed to send snapshot to %v: %v", s.peer, err) + return + } + + // Check if there is more to replicate + goto CHECK_MORE +} + +// sendLatestSnapshot is used to send the latest snapshot we have +// down to our follower. 
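replicateTo above (and heartbeat below) throttles a failing peer with backoff(failureWait, s.failures, maxFailureScale). The helper itself is defined in the package's util.go, which this hunk does not touch; the sketch below is a hedged reconstruction of its shape — a base wait that doubles per failure round, capped at maxFailureScale doublings — given a different name so it is not mistaken for the vendored code (assumes the time import):

```go
// backoffSketch approximates the package's backoff helper: the base duration
// doubles for each failure round beyond the first couple, capped at `limit`
// rounds. Reconstruction for illustration only.
func backoffSketch(base time.Duration, round, limit uint64) time.Duration {
	if round > limit {
		round = limit
	}
	for ; round > 2; round-- {
		base *= 2
	}
	return base
}
```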
+func (r *Raft) sendLatestSnapshot(s *followerReplication) (bool, error) { + // Get the snapshots + snapshots, err := r.snapshots.List() + if err != nil { + r.logger.Printf("[ERR] raft: Failed to list snapshots: %v", err) + return false, err + } + + // Check we have at least a single snapshot + if len(snapshots) == 0 { + return false, fmt.Errorf("no snapshots found") + } + + // Open the most recent snapshot + snapID := snapshots[0].ID + meta, snapshot, err := r.snapshots.Open(snapID) + if err != nil { + r.logger.Printf("[ERR] raft: Failed to open snapshot %v: %v", snapID, err) + return false, err + } + defer snapshot.Close() + + // Setup the request + req := InstallSnapshotRequest{ + Term: s.currentTerm, + Leader: r.trans.EncodePeer(r.localAddr), + LastLogIndex: meta.Index, + LastLogTerm: meta.Term, + Peers: meta.Peers, + Size: meta.Size, + } + + // Make the call + start := time.Now() + var resp InstallSnapshotResponse + if err := r.trans.InstallSnapshot(s.peer, &req, &resp, snapshot); err != nil { + r.logger.Printf("[ERR] raft: Failed to install snapshot %v: %v", snapID, err) + s.failures++ + return false, err + } + metrics.MeasureSince([]string{"raft", "replication", "installSnapshot", s.peer}, start) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return true, nil + } + + // Update the last contact + s.setLastContact() + + // Check for success + if resp.Success { + // Mark any inflight logs as committed + s.inflight.CommitRange(s.matchIndex+1, meta.Index) + + // Update the indexes + s.matchIndex = meta.Index + s.nextIndex = s.matchIndex + 1 + + // Clear any failures + s.failures = 0 + + // Notify we are still leader + s.notifyAll(true) + } else { + s.failures++ + r.logger.Printf("[WARN] raft: InstallSnapshot to %v rejected", s.peer) + } + return false, nil +} + +// heartbeat is used to periodically invoke AppendEntries on a peer +// to ensure they don't time out. This is done async of replicate(), +// since that routine could potentially be blocked on disk IO. +func (r *Raft) heartbeat(s *followerReplication, stopCh chan struct{}) { + var failures uint64 + req := AppendEntriesRequest{ + Term: s.currentTerm, + Leader: r.trans.EncodePeer(r.localAddr), + } + var resp AppendEntriesResponse + for { + // Wait for the next heartbeat interval or forced notify + select { + case <-s.notifyCh: + case <-randomTimeout(r.conf.HeartbeatTimeout / 10): + case <-stopCh: + return + } + + start := time.Now() + if err := r.trans.AppendEntries(s.peer, &req, &resp); err != nil { + r.logger.Printf("[ERR] raft: Failed to heartbeat to %v: %v", s.peer, err) + failures++ + select { + case <-time.After(backoff(failureWait, failures, maxFailureScale)): + case <-stopCh: + } + } else { + s.setLastContact() + failures = 0 + metrics.MeasureSince([]string{"raft", "replication", "heartbeat", s.peer}, start) + s.notifyAll(resp.Success) + } + } +} + +// pipelineReplicate is used when we have synchronized our state with the follower, +// and want to switch to a higher performance pipeline mode of replication. +// We only pipeline AppendEntries commands, and if we ever hit an error, we fall +// back to the standard replication which can handle more complex situations. 
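Pipelining is strictly optional for a transport: replicate() above only logs an error when AppendEntriesPipeline fails with something other than ErrPipelineReplicationNotSupported, and otherwise silently stays on the per-RPC path. A hypothetical wrapper (not part of this change) that opts a transport out of pipelining is a single-method override:

```go
// noPipelineTransport is an illustrative wrapper that disables pipelining
// while delegating every other RPC to the embedded Transport.
type noPipelineTransport struct {
	Transport
}

// AppendEntriesPipeline reports that pipelining is unsupported; replicate()
// treats this particular error as a silent fall-back to standard replication.
func (t *noPipelineTransport) AppendEntriesPipeline(target string) (AppendPipeline, error) {
	return nil, ErrPipelineReplicationNotSupported
}
```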
+func (r *Raft) pipelineReplicate(s *followerReplication) error { + // Create a new pipeline + pipeline, err := r.trans.AppendEntriesPipeline(s.peer) + if err != nil { + return err + } + defer pipeline.Close() + + // Log start and stop of pipeline + r.logger.Printf("[INFO] raft: pipelining replication to peer %v", s.peer) + defer r.logger.Printf("[INFO] raft: aborting pipeline replication to peer %v", s.peer) + + // Create a shutdown and finish channel + stopCh := make(chan struct{}) + finishCh := make(chan struct{}) + + // Start a dedicated decoder + r.goFunc(func() { r.pipelineDecode(s, pipeline, stopCh, finishCh) }) + + // Start pipeline sends at the last good nextIndex + nextIndex := s.nextIndex + + shouldStop := false +SEND: + for !shouldStop { + select { + case <-finishCh: + break SEND + case maxIndex := <-s.stopCh: + if maxIndex > 0 { + r.pipelineSend(s, pipeline, &nextIndex, maxIndex) + } + break SEND + case <-s.triggerCh: + lastLogIdx, _ := r.getLastLog() + shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx) + case <-randomTimeout(r.conf.CommitTimeout): + lastLogIdx, _ := r.getLastLog() + shouldStop = r.pipelineSend(s, pipeline, &nextIndex, lastLogIdx) + } + } + + // Stop our decoder, and wait for it to finish + close(stopCh) + select { + case <-finishCh: + case <-r.shutdownCh: + } + return nil +} + +// pipelineSend is used to send data over a pipeline. +func (r *Raft) pipelineSend(s *followerReplication, p AppendPipeline, nextIdx *uint64, lastIndex uint64) (shouldStop bool) { + // Create a new append request + req := new(AppendEntriesRequest) + if err := r.setupAppendEntries(s, req, *nextIdx, lastIndex); err != nil { + return true + } + + // Pipeline the append entries + if _, err := p.AppendEntries(req, new(AppendEntriesResponse)); err != nil { + r.logger.Printf("[ERR] raft: Failed to pipeline AppendEntries to %v: %v", s.peer, err) + return true + } + + // Increase the next send log to avoid re-sending old logs + if n := len(req.Entries); n > 0 { + last := req.Entries[n-1] + *nextIdx = last.Index + 1 + } + return false +} + +// pipelineDecode is used to decode the responses of pipelined requests. +func (r *Raft) pipelineDecode(s *followerReplication, p AppendPipeline, stopCh, finishCh chan struct{}) { + defer close(finishCh) + respCh := p.Consumer() + for { + select { + case ready := <-respCh: + req, resp := ready.Request(), ready.Response() + appendStats(s.peer, ready.Start(), float32(len(req.Entries))) + + // Check for a newer term, stop running + if resp.Term > req.Term { + r.handleStaleTerm(s) + return + } + + // Update the last contact + s.setLastContact() + + // Abort pipeline if not successful + if !resp.Success { + return + } + + // Update our replication state + updateLastAppended(s, req) + case <-stopCh: + return + } + } +} + +// setupAppendEntries is used to setup an append entries request. +func (r *Raft) setupAppendEntries(s *followerReplication, req *AppendEntriesRequest, nextIndex, lastIndex uint64) error { + req.Term = s.currentTerm + req.Leader = r.trans.EncodePeer(r.localAddr) + req.LeaderCommitIndex = r.getCommitIndex() + if err := r.setPreviousLog(req, nextIndex); err != nil { + return err + } + if err := r.setNewLogs(req, nextIndex, lastIndex); err != nil { + return err + } + return nil +} + +// setPreviousLog is used to setup the PrevLogEntry and PrevLogTerm for an +// AppendEntriesRequest given the next index to replicate. 
+func (r *Raft) setPreviousLog(req *AppendEntriesRequest, nextIndex uint64) error { + // Guard for the first index, since there is no 0 log entry + // Guard against the previous index being a snapshot as well + lastSnapIdx, lastSnapTerm := r.getLastSnapshot() + if nextIndex == 1 { + req.PrevLogEntry = 0 + req.PrevLogTerm = 0 + + } else if (nextIndex - 1) == lastSnapIdx { + req.PrevLogEntry = lastSnapIdx + req.PrevLogTerm = lastSnapTerm + + } else { + var l Log + if err := r.logs.GetLog(nextIndex-1, &l); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", + nextIndex-1, err) + return err + } + + // Set the previous index and term (0 if nextIndex is 1) + req.PrevLogEntry = l.Index + req.PrevLogTerm = l.Term + } + return nil +} + +// setNewLogs is used to setup the logs which should be appended for a request. +func (r *Raft) setNewLogs(req *AppendEntriesRequest, nextIndex, lastIndex uint64) error { + // Append up to MaxAppendEntries or up to the lastIndex + req.Entries = make([]*Log, 0, r.conf.MaxAppendEntries) + maxIndex := min(nextIndex+uint64(r.conf.MaxAppendEntries)-1, lastIndex) + for i := nextIndex; i <= maxIndex; i++ { + oldLog := new(Log) + if err := r.logs.GetLog(i, oldLog); err != nil { + r.logger.Printf("[ERR] raft: Failed to get log at index %d: %v", i, err) + return err + } + req.Entries = append(req.Entries, oldLog) + } + return nil +} + +// appendStats is used to emit stats about an AppendEntries invocation. +func appendStats(peer string, start time.Time, logs float32) { + metrics.MeasureSince([]string{"raft", "replication", "appendEntries", "rpc", peer}, start) + metrics.IncrCounter([]string{"raft", "replication", "appendEntries", "logs", peer}, logs) +} + +// handleStaleTerm is used when a follower indicates that we have a stale term. +func (r *Raft) handleStaleTerm(s *followerReplication) { + r.logger.Printf("[ERR] raft: peer %v has newer term, stopping replication", s.peer) + s.notifyAll(false) // No longer leader + asyncNotifyCh(s.stepDown) +} + +// updateLastAppended is used to update follower replication state after a successful +// AppendEntries RPC. +func updateLastAppended(s *followerReplication, req *AppendEntriesRequest) { + // Mark any inflight logs as committed + if logs := req.Entries; len(logs) > 0 { + first := logs[0] + last := logs[len(logs)-1] + s.inflight.CommitRange(first.Index, last.Index) + + // Update the indexes + s.matchIndex = last.Index + s.nextIndex = last.Index + 1 + } + + // Notify still leader + s.notifyAll(true) +} diff --git a/go/vt/orchestrator/external/raft/snapshot.go b/go/vt/orchestrator/external/raft/snapshot.go new file mode 100644 index 0000000000..a4a17f1cc6 --- /dev/null +++ b/go/vt/orchestrator/external/raft/snapshot.go @@ -0,0 +1,40 @@ +package raft + +import ( + "io" +) + +// SnapshotMeta is for metadata of a snapshot. +type SnapshotMeta struct { + ID string // ID is opaque to the store, and is used for opening + Index uint64 + Term uint64 + Peers []byte + Size int64 +} + +// SnapshotStore interface is used to allow for flexible implementations +// of snapshot storage and retrieval. For example, a client could implement +// a shared state store such as S3, allowing new nodes to restore snapshots +// without streaming from the leader. +type SnapshotStore interface { + // Create is used to begin a snapshot at a given index and term, + // with the current peer set already encoded. 
+ Create(index, term uint64, peers []byte) (SnapshotSink, error) + + // List is used to list the available snapshots in the store. + // It should return then in descending order, with the highest index first. + List() ([]*SnapshotMeta, error) + + // Open takes a snapshot ID and provides a ReadCloser. Once close is + // called it is assumed the snapshot is no longer needed. + Open(id string) (*SnapshotMeta, io.ReadCloser, error) +} + +// SnapshotSink is returned by StartSnapshot. The FSM will Write state +// to the sink and call Close on completion. On error, Cancel will be invoked. +type SnapshotSink interface { + io.WriteCloser + ID() string + Cancel() error +} diff --git a/go/vt/orchestrator/external/raft/stable.go b/go/vt/orchestrator/external/raft/stable.go new file mode 100644 index 0000000000..ff59a8c570 --- /dev/null +++ b/go/vt/orchestrator/external/raft/stable.go @@ -0,0 +1,15 @@ +package raft + +// StableStore is used to provide stable storage +// of key configurations to ensure safety. +type StableStore interface { + Set(key []byte, val []byte) error + + // Get returns the value for key, or an empty byte slice if key was not found. + Get(key []byte) ([]byte, error) + + SetUint64(key []byte, val uint64) error + + // GetUint64 returns the uint64 value for key, or 0 if key was not found. + GetUint64(key []byte) (uint64, error) +} diff --git a/go/vt/orchestrator/external/raft/state.go b/go/vt/orchestrator/external/raft/state.go new file mode 100644 index 0000000000..f6d658b8bb --- /dev/null +++ b/go/vt/orchestrator/external/raft/state.go @@ -0,0 +1,167 @@ +package raft + +import ( + "sync" + "sync/atomic" +) + +// RaftState captures the state of a Raft node: Follower, Candidate, Leader, +// or Shutdown. +type RaftState uint32 + +const ( + // Follower is the initial state of a Raft node. + Follower RaftState = iota + + // Candidate is one of the valid states of a Raft node. + Candidate + + // Leader is one of the valid states of a Raft node. + Leader + + // Shutdown is the terminal state of a Raft node. + Shutdown +) + +func (s RaftState) String() string { + switch s { + case Follower: + return "Follower" + case Candidate: + return "Candidate" + case Leader: + return "Leader" + case Shutdown: + return "Shutdown" + default: + return "Unknown" + } +} + +// raftState is used to maintain various state variables +// and provides an interface to set/get the variables in a +// thread safe manner. 
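Before raftState (defined next), one note on the StableStore contract introduced in stable.go above: a missing key is not an error — Get returns an empty slice and GetUint64 returns 0. The package's own InmemStore (used by the tests earlier in this diff) already implements this; the map-backed sketch below only spells the contract out and is not part of the change (it assumes the sync and encoding/binary imports):

```go
// mapStableStore is an illustrative StableStore backed by a map.
type mapStableStore struct {
	mu sync.Mutex
	kv map[string][]byte
}

func newMapStableStore() *mapStableStore {
	return &mapStableStore{kv: make(map[string][]byte)}
}

func (s *mapStableStore) Set(key, val []byte) error {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.kv[string(key)] = append([]byte(nil), val...) // copy to detach from caller
	return nil
}

func (s *mapStableStore) Get(key []byte) ([]byte, error) {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.kv[string(key)], nil // nil/empty slice when the key is absent
}

func (s *mapStableStore) SetUint64(key []byte, val uint64) error {
	buf := make([]byte, 8)
	binary.BigEndian.PutUint64(buf, val)
	return s.Set(key, buf)
}

func (s *mapStableStore) GetUint64(key []byte) (uint64, error) {
	buf, err := s.Get(key)
	if err != nil || len(buf) != 8 {
		return 0, err // 0 when the key is absent or malformed
	}
	return binary.BigEndian.Uint64(buf), nil
}
```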
+type raftState struct { + // The current term, cache of StableStore + currentTerm uint64 + + // Highest committed log entry + commitIndex uint64 + + // Last applied log to the FSM + lastApplied uint64 + + // protects 4 next fields + lastLock sync.Mutex + + // Cache the latest snapshot index/term + lastSnapshotIndex uint64 + lastSnapshotTerm uint64 + + // Cache the latest log from LogStore + lastLogIndex uint64 + lastLogTerm uint64 + + // Tracks running goroutines + routinesGroup sync.WaitGroup + + // The current state + state RaftState +} + +func (r *raftState) getState() RaftState { + stateAddr := (*uint32)(&r.state) + return RaftState(atomic.LoadUint32(stateAddr)) +} + +func (r *raftState) setState(s RaftState) { + stateAddr := (*uint32)(&r.state) + atomic.StoreUint32(stateAddr, uint32(s)) +} + +func (r *raftState) getCurrentTerm() uint64 { + return atomic.LoadUint64(&r.currentTerm) +} + +func (r *raftState) setCurrentTerm(term uint64) { + atomic.StoreUint64(&r.currentTerm, term) +} + +func (r *raftState) getLastLog() (index, term uint64) { + r.lastLock.Lock() + index = r.lastLogIndex + term = r.lastLogTerm + r.lastLock.Unlock() + return +} + +func (r *raftState) setLastLog(index, term uint64) { + r.lastLock.Lock() + r.lastLogIndex = index + r.lastLogTerm = term + r.lastLock.Unlock() +} + +func (r *raftState) getLastSnapshot() (index, term uint64) { + r.lastLock.Lock() + index = r.lastSnapshotIndex + term = r.lastSnapshotTerm + r.lastLock.Unlock() + return +} + +func (r *raftState) setLastSnapshot(index, term uint64) { + r.lastLock.Lock() + r.lastSnapshotIndex = index + r.lastSnapshotTerm = term + r.lastLock.Unlock() +} + +func (r *raftState) getCommitIndex() uint64 { + return atomic.LoadUint64(&r.commitIndex) +} + +func (r *raftState) setCommitIndex(index uint64) { + atomic.StoreUint64(&r.commitIndex, index) +} + +func (r *raftState) getLastApplied() uint64 { + return atomic.LoadUint64(&r.lastApplied) +} + +func (r *raftState) setLastApplied(index uint64) { + atomic.StoreUint64(&r.lastApplied, index) +} + +// Start a goroutine and properly handle the race between a routine +// starting and incrementing, and exiting and decrementing. +func (r *raftState) goFunc(f func()) { + r.routinesGroup.Add(1) + go func() { + defer r.routinesGroup.Done() + f() + }() +} + +func (r *raftState) waitShutdown() { + r.routinesGroup.Wait() +} + +// getLastIndex returns the last index in stable storage. +// Either from the last log or from the last snapshot. +func (r *raftState) getLastIndex() uint64 { + r.lastLock.Lock() + defer r.lastLock.Unlock() + return max(r.lastLogIndex, r.lastSnapshotIndex) +} + +// getLastEntry returns the last index and term in stable storage. +// Either from the last log or from the last snapshot. +func (r *raftState) getLastEntry() (uint64, uint64) { + r.lastLock.Lock() + defer r.lastLock.Unlock() + if r.lastLogIndex >= r.lastSnapshotIndex { + return r.lastLogIndex, r.lastLogTerm + } + return r.lastSnapshotIndex, r.lastSnapshotTerm +} diff --git a/go/vt/orchestrator/external/raft/tcp_transport.go b/go/vt/orchestrator/external/raft/tcp_transport.go new file mode 100644 index 0000000000..50c6d15df1 --- /dev/null +++ b/go/vt/orchestrator/external/raft/tcp_transport.go @@ -0,0 +1,105 @@ +package raft + +import ( + "errors" + "io" + "log" + "net" + "time" +) + +var ( + errNotAdvertisable = errors.New("local bind address is not advertisable") + errNotTCP = errors.New("local address is not a TCP address") +) + +// TCPStreamLayer implements StreamLayer interface for plain TCP. 
+type TCPStreamLayer struct { + advertise net.Addr + listener *net.TCPListener +} + +// NewTCPTransport returns a NetworkTransport that is built on top of +// a TCP streaming transport layer. +func NewTCPTransport( + bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + logOutput io.Writer, +) (*NetworkTransport, error) { + return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport { + return NewNetworkTransport(stream, maxPool, timeout, logOutput) + }) +} + +// NewTCPTransportWithLogger returns a NetworkTransport that is built on top of +// a TCP streaming transport layer, with log output going to the supplied Logger +func NewTCPTransportWithLogger( + bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + logger *log.Logger, +) (*NetworkTransport, error) { + return newTCPTransport(bindAddr, advertise, maxPool, timeout, func(stream StreamLayer) *NetworkTransport { + return NewNetworkTransportWithLogger(stream, maxPool, timeout, logger) + }) +} + +func newTCPTransport(bindAddr string, + advertise net.Addr, + maxPool int, + timeout time.Duration, + transportCreator func(stream StreamLayer) *NetworkTransport) (*NetworkTransport, error) { + // Try to bind + list, err := net.Listen("tcp", bindAddr) + if err != nil { + return nil, err + } + + // Create stream + stream := &TCPStreamLayer{ + advertise: advertise, + listener: list.(*net.TCPListener), + } + + // Verify that we have a usable advertise address + addr, ok := stream.Addr().(*net.TCPAddr) + if !ok { + list.Close() + return nil, errNotTCP + } + if addr.IP.IsUnspecified() { + list.Close() + return nil, errNotAdvertisable + } + + // Create the network transport + trans := transportCreator(stream) + return trans, nil +} + +// Dial implements the StreamLayer interface. +func (t *TCPStreamLayer) Dial(address string, timeout time.Duration) (net.Conn, error) { + return net.DialTimeout("tcp", address, timeout) +} + +// Accept implements the net.Listener interface. +func (t *TCPStreamLayer) Accept() (c net.Conn, err error) { + return t.listener.Accept() +} + +// Close implements the net.Listener interface. +func (t *TCPStreamLayer) Close() (err error) { + return t.listener.Close() +} + +// Addr implements the net.Listener interface. 
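Stepping back from the StreamLayer methods for a moment: newTCPTransport above refuses to start unless the resolved address is a TCP address that is actually advertisable, so a 0.0.0.0 bind requires an explicit advertise address (the BadAddr/WithAdvertise tests below exercise exactly this). Binding to a concrete IP such as loopback needs no advertise address; the in-package helper below is an assumption-labeled sketch of the constructor's arguments, not part of the change (assumes the os and time imports):

```go
// newLoopbackTCPTransport is an illustrative in-package helper showing the
// arguments NewTCPTransport expects.
func newLoopbackTCPTransport() (*NetworkTransport, error) {
	return NewTCPTransport(
		"127.0.0.1:0",  // bind address; a concrete IP is advertisable as-is
		nil,            // advertise address: only needed for 0.0.0.0 binds
		3,              // maximum connection pool size per peer
		10*time.Second, // connect/IO timeout
		os.Stderr,      // log output
	)
}
```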
+func (t *TCPStreamLayer) Addr() net.Addr { + // Use an advertise addr if provided + if t.advertise != nil { + return t.advertise + } + return t.listener.Addr() +} diff --git a/go/vt/orchestrator/external/raft/tcp_transport_test.go b/go/vt/orchestrator/external/raft/tcp_transport_test.go new file mode 100644 index 0000000000..6020a546ca --- /dev/null +++ b/go/vt/orchestrator/external/raft/tcp_transport_test.go @@ -0,0 +1,24 @@ +package raft + +import ( + "net" + "testing" +) + +func TestTCPTransport_BadAddr(t *testing.T) { + _, err := NewTCPTransportWithLogger("0.0.0.0:0", nil, 1, 0, newTestLogger(t)) + if err != errNotAdvertisable { + t.Fatalf("err: %v", err) + } +} + +func TestTCPTransport_WithAdvertise(t *testing.T) { + addr := &net.TCPAddr{IP: []byte{127, 0, 0, 1}, Port: 12345} + trans, err := NewTCPTransportWithLogger("0.0.0.0:0", addr, 1, 0, newTestLogger(t)) + if err != nil { + t.Fatalf("err: %v", err) + } + if trans.LocalAddr() != "127.0.0.1:12345" { + t.Fatalf("bad: %v", trans.LocalAddr()) + } +} diff --git a/go/vt/orchestrator/external/raft/transport.go b/go/vt/orchestrator/external/raft/transport.go new file mode 100644 index 0000000000..2b8b422ff0 --- /dev/null +++ b/go/vt/orchestrator/external/raft/transport.go @@ -0,0 +1,124 @@ +package raft + +import ( + "io" + "time" +) + +// RPCResponse captures both a response and a potential error. +type RPCResponse struct { + Response interface{} + Error error +} + +// RPC has a command, and provides a response mechanism. +type RPC struct { + Command interface{} + Reader io.Reader // Set only for InstallSnapshot + RespChan chan<- RPCResponse +} + +// Respond is used to respond with a response, error or both +func (r *RPC) Respond(resp interface{}, err error) { + r.RespChan <- RPCResponse{resp, err} +} + +// Transport provides an interface for network transports +// to allow Raft to communicate with other nodes. +type Transport interface { + // Consumer returns a channel that can be used to + // consume and respond to RPC requests. + Consumer() <-chan RPC + + // LocalAddr is used to return our local address to distinguish from our peers. + LocalAddr() string + + // AppendEntriesPipeline returns an interface that can be used to pipeline + // AppendEntries requests. + AppendEntriesPipeline(target string) (AppendPipeline, error) + + // AppendEntries sends the appropriate RPC to the target node. + AppendEntries(target string, args *AppendEntriesRequest, resp *AppendEntriesResponse) error + + // RequestVote sends the appropriate RPC to the target node. + RequestVote(target string, args *RequestVoteRequest, resp *RequestVoteResponse) error + + // InstallSnapshot is used to push a snapshot down to a follower. The data is read from + // the ReadCloser and streamed to the client. + InstallSnapshot(target string, args *InstallSnapshotRequest, resp *InstallSnapshotResponse, data io.Reader) error + + // EncodePeer is used to serialize a peer name. + EncodePeer(string) []byte + + // DecodePeer is used to deserialize a peer name. + DecodePeer([]byte) string + + // SetHeartbeatHandler is used to setup a heartbeat handler + // as a fast-pass. This is to avoid head-of-line blocking from + // disk IO. If a Transport does not support this, it can simply + // ignore the call, and push the heartbeat onto the Consumer channel. + SetHeartbeatHandler(cb func(rpc RPC)) +} + +// WithClose is an interface that a transport may provide which +// allows a transport to be shut down cleanly when a Raft instance +// shuts down. 
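+// The LoopbackTransport used by the tests below embeds it, which is why those
+// tests can call trans.Close unconditionally.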
+// +// It is defined separately from Transport as unfortunately it wasn't in the +// original interface specification. +type WithClose interface { + // Close permanently closes a transport, stopping + // any associated goroutines and freeing other resources. + Close() error +} + +// LoopbackTransport is an interface that provides a loopback transport suitable for testing +// e.g. InmemTransport. It's there so we don't have to rewrite tests. +type LoopbackTransport interface { + Transport // Embedded transport reference + WithPeers // Embedded peer management + WithClose // with a close routine +} + +// WithPeers is an interface that a transport may provide which allows for connection and +// disconnection. Unless the transport is a loopback transport, the transport specified to +// "Connect" is likely to be nil. +type WithPeers interface { + Connect(peer string, t Transport) // Connect a peer + Disconnect(peer string) // Disconnect a given peer + DisconnectAll() // Disconnect all peers, possibly to reconnect them later +} + +// AppendPipeline is used for pipelining AppendEntries requests. It is used +// to increase the replication throughput by masking latency and better +// utilizing bandwidth. +type AppendPipeline interface { + // AppendEntries is used to add another request to the pipeline. + // The send may block which is an effective form of back-pressure. + AppendEntries(args *AppendEntriesRequest, resp *AppendEntriesResponse) (AppendFuture, error) + + // Consumer returns a channel that can be used to consume + // response futures when they are ready. + Consumer() <-chan AppendFuture + + // Close closes the pipeline and cancels all inflight RPCs + Close() error +} + +// AppendFuture is used to return information about a pipelined AppendEntries request. +type AppendFuture interface { + Future + + // Start returns the time that the append request was started. + // It is always OK to call this method. + Start() time.Time + + // Request holds the parameters of the AppendEntries call. + // It is always OK to call this method. + Request() *AppendEntriesRequest + + // Response holds the results of the AppendEntries call. + // This method must only be called after the Error + // method returns, and will only be valid on success. 
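+	//
+	// An illustrative consumer loop over a pipeline (hypothetical; real callers
+	// typically select with a timeout, as the tests below do):
+	//
+	//	for future := range pipeline.Consumer() {
+	//		if err := future.Error(); err != nil {
+	//			continue // the append failed; Response would not be valid
+	//		}
+	//		resp := future.Response() // inspect Term, Success, LastLog
+	//		_ = resp
+	//	}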
+ Response() *AppendEntriesResponse +} diff --git a/go/vt/orchestrator/external/raft/transport_test.go b/go/vt/orchestrator/external/raft/transport_test.go new file mode 100644 index 0000000000..b89c3c90dd --- /dev/null +++ b/go/vt/orchestrator/external/raft/transport_test.go @@ -0,0 +1,313 @@ +package raft + +import ( + "bytes" + "reflect" + "testing" + "time" +) + +const ( + TT_Inmem = iota + + // NOTE: must be last + numTestTransports +) + +func NewTestTransport(ttype int, addr string) (string, LoopbackTransport) { + switch ttype { + case TT_Inmem: + addr, lt := NewInmemTransport(addr) + return addr, lt + default: + panic("Unknown transport type") + } +} + +func TestTransport_StartStop(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + _, trans := NewTestTransport(ttype, "") + if err := trans.Close(); err != nil { + t.Fatalf("err: %v", err) + } + } +} + +func TestTransport_AppendEntries(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + addr1, trans1 := NewTestTransport(ttype, "") + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + PrevLogEntry: 100, + PrevLogTerm: 4, + Entries: []*Log{ + { + Index: 101, + Term: 4, + Type: LogNoop, + }, + }, + LeaderCommitIndex: 90, + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + addr2, trans2 := NewTestTransport(ttype, "") + defer trans2.Close() + + trans1.Connect(addr2, trans2) + trans2.Connect(addr1, trans1) + + var out AppendEntriesResponse + if err := trans2.AppendEntries(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } + } +} + +func TestTransport_AppendEntriesPipeline(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + addr1, trans1 := NewTestTransport(ttype, "") + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := AppendEntriesRequest{ + Term: 10, + Leader: []byte("cartman"), + PrevLogEntry: 100, + PrevLogTerm: 4, + Entries: []*Log{ + { + Index: 101, + Term: 4, + Type: LogNoop, + }, + }, + LeaderCommitIndex: 90, + } + resp := AppendEntriesResponse{ + Term: 4, + LastLog: 90, + Success: true, + } + + // Listen for a request + go func() { + for i := 0; i < 10; i++ { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*AppendEntriesRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + } + }() + + // Transport 2 makes outbound request + addr2, trans2 := NewTestTransport(ttype, "") + defer trans2.Close() + + trans1.Connect(addr2, trans2) + trans2.Connect(addr1, trans1) + + pipeline, err := trans2.AppendEntriesPipeline(trans1.LocalAddr()) + if err != nil { + t.Fatalf("err: %v", err) + } + defer pipeline.Close() + for i := 0; i < 10; i++ { + out := new(AppendEntriesResponse) + if _, err := 
pipeline.AppendEntries(&args, out); err != nil { + t.Fatalf("err: %v", err) + } + } + + respCh := pipeline.Consumer() + for i := 0; i < 10; i++ { + select { + case ready := <-respCh: + // Verify the response + if !reflect.DeepEqual(&resp, ready.Response()) { + t.Fatalf("command mismatch: %#v %#v", &resp, ready.Response()) + } + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + } + } +} + +func TestTransport_RequestVote(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + addr1, trans1 := NewTestTransport(ttype, "") + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := RequestVoteRequest{ + Term: 20, + Candidate: []byte("butters"), + LastLogIndex: 100, + LastLogTerm: 19, + } + resp := RequestVoteResponse{ + Term: 100, + Peers: []byte("blah"), + Granted: false, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*RequestVoteRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + addr2, trans2 := NewTestTransport(ttype, "") + defer trans2.Close() + + trans1.Connect(addr2, trans2) + trans2.Connect(addr1, trans1) + + var out RequestVoteResponse + if err := trans2.RequestVote(trans1.LocalAddr(), &args, &out); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } + } +} + +func TestTransport_InstallSnapshot(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + addr1, trans1 := NewTestTransport(ttype, "") + defer trans1.Close() + rpcCh := trans1.Consumer() + + // Make the RPC request + args := InstallSnapshotRequest{ + Term: 10, + Leader: []byte("kyle"), + LastLogIndex: 100, + LastLogTerm: 9, + Peers: []byte("blah blah"), + Size: 10, + } + resp := InstallSnapshotResponse{ + Term: 10, + Success: true, + } + + // Listen for a request + go func() { + select { + case rpc := <-rpcCh: + // Verify the command + req := rpc.Command.(*InstallSnapshotRequest) + if !reflect.DeepEqual(req, &args) { + t.Fatalf("command mismatch: %#v %#v", *req, args) + } + + // Try to read the bytes + buf := make([]byte, 10) + rpc.Reader.Read(buf) + + // Compare + if bytes.Compare(buf, []byte("0123456789")) != 0 { + t.Fatalf("bad buf %v", buf) + } + + rpc.Respond(&resp, nil) + + case <-time.After(200 * time.Millisecond): + t.Fatalf("timeout") + } + }() + + // Transport 2 makes outbound request + addr2, trans2 := NewTestTransport(ttype, "") + defer trans2.Close() + + trans1.Connect(addr2, trans2) + trans2.Connect(addr1, trans1) + + // Create a buffer + buf := bytes.NewBuffer([]byte("0123456789")) + + var out InstallSnapshotResponse + if err := trans2.InstallSnapshot(trans1.LocalAddr(), &args, &out, buf); err != nil { + t.Fatalf("err: %v", err) + } + + // Verify the response + if !reflect.DeepEqual(resp, out) { + t.Fatalf("command mismatch: %#v %#v", resp, out) + } + } +} + +func TestTransport_EncodeDecode(t *testing.T) { + for ttype := 0; ttype < numTestTransports; ttype++ { + _, trans1 := NewTestTransport(ttype, "") + defer trans1.Close() + + local := trans1.LocalAddr() + enc := trans1.EncodePeer(local) + dec := trans1.DecodePeer(enc) + + if dec != local { + t.Fatalf("enc/dec fail: %v %v", dec, local) + } + } +} diff --git 
a/go/vt/orchestrator/external/raft/util.go b/go/vt/orchestrator/external/raft/util.go new file mode 100644 index 0000000000..944968a25c --- /dev/null +++ b/go/vt/orchestrator/external/raft/util.go @@ -0,0 +1,179 @@ +package raft + +import ( + "bytes" + crand "crypto/rand" + "fmt" + "math" + "math/big" + "math/rand" + "time" + + "github.com/hashicorp/go-msgpack/codec" +) + +func init() { + // Ensure we use a high-entropy seed for the psuedo-random generator + rand.Seed(newSeed()) +} + +// returns an int64 from a crypto random source +// can be used to seed a source for a math/rand. +func newSeed() int64 { + r, err := crand.Int(crand.Reader, big.NewInt(math.MaxInt64)) + if err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + return r.Int64() +} + +// randomTimeout returns a value that is between the minVal and 2x minVal. +func randomTimeout(minVal time.Duration) <-chan time.Time { + if minVal == 0 { + return nil + } + extra := (time.Duration(rand.Int63()) % minVal) + return time.After(minVal + extra) +} + +// min returns the minimum. +func min(a, b uint64) uint64 { + if a <= b { + return a + } + return b +} + +// max returns the maximum. +func max(a, b uint64) uint64 { + if a >= b { + return a + } + return b +} + +// generateUUID is used to generate a random UUID. +func generateUUID() string { + buf := make([]byte, 16) + if _, err := crand.Read(buf); err != nil { + panic(fmt.Errorf("failed to read random bytes: %v", err)) + } + + return fmt.Sprintf("%08x-%04x-%04x-%04x-%12x", + buf[0:4], + buf[4:6], + buf[6:8], + buf[8:10], + buf[10:16]) +} + +// asyncNotifyCh is used to do an async channel send +// to a single channel without blocking. +func asyncNotifyCh(ch chan struct{}) { + select { + case ch <- struct{}{}: + default: + } +} + +// asyncNotifyBool is used to do an async notification +// on a bool channel. +func asyncNotifyBool(ch chan bool, v bool) { + select { + case ch <- v: + default: + } +} + +// ExcludePeer is used to exclude a single peer from a list of peers. +func ExcludePeer(peers []string, peer string) []string { + otherPeers := make([]string, 0, len(peers)) + for _, p := range peers { + if p != peer { + otherPeers = append(otherPeers, p) + } + } + return otherPeers +} + +// PeerContained checks if a given peer is contained in a list. +func PeerContained(peers []string, peer string) bool { + for _, p := range peers { + if p == peer { + return true + } + } + return false +} + +// AddUniquePeer is used to add a peer to a list of existing +// peers only if it is not already contained. +func AddUniquePeer(peers []string, peer string) []string { + if PeerContained(peers, peer) { + return peers + } + return append(peers, peer) +} + +// encodePeers is used to serialize a list of peers. +func encodePeers(peers []string, trans Transport) []byte { + // Encode each peer + var encPeers [][]byte + for _, p := range peers { + encPeers = append(encPeers, trans.EncodePeer(p)) + } + + // Encode the entire array + buf, err := encodeMsgPack(encPeers) + if err != nil { + panic(fmt.Errorf("failed to encode peers: %v", err)) + } + + return buf.Bytes() +} + +// decodePeers is used to deserialize a list of peers. 
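+// It is the inverse of encodePeers above; an illustrative round trip (addresses
+// are hypothetical):
+//
+//	peers := []string{"10.0.0.1:7000", "10.0.0.2:7000"}
+//	buf := encodePeers(peers, trans) // trans is any Transport
+//	decoded := decodePeers(buf, trans)
+//	// decoded now equals peers, as exercised by TestEncodeDecodePeers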
+func decodePeers(buf []byte, trans Transport) []string { + // Decode the buffer first + var encPeers [][]byte + if err := decodeMsgPack(buf, &encPeers); err != nil { + panic(fmt.Errorf("failed to decode peers: %v", err)) + } + + // Deserialize each peer + var peers []string + for _, enc := range encPeers { + peers = append(peers, trans.DecodePeer(enc)) + } + + return peers +} + +// Decode reverses the encode operation on a byte slice input. +func decodeMsgPack(buf []byte, out interface{}) error { + r := bytes.NewBuffer(buf) + hd := codec.MsgpackHandle{} + dec := codec.NewDecoder(r, &hd) + return dec.Decode(out) +} + +// Encode writes an encoded object to a new bytes buffer. +func encodeMsgPack(in interface{}) (*bytes.Buffer, error) { + buf := bytes.NewBuffer(nil) + hd := codec.MsgpackHandle{} + enc := codec.NewEncoder(buf, &hd) + err := enc.Encode(in) + return buf, err +} + +// backoff is used to compute an exponential backoff +// duration. Base time is scaled by the current round, +// up to some maximum scale factor. +func backoff(base time.Duration, round, limit uint64) time.Duration { + power := min(round, limit) + for power > 2 { + base *= 2 + power-- + } + return base +} diff --git a/go/vt/orchestrator/external/raft/util_test.go b/go/vt/orchestrator/external/raft/util_test.go new file mode 100644 index 0000000000..88b93211f4 --- /dev/null +++ b/go/vt/orchestrator/external/raft/util_test.go @@ -0,0 +1,152 @@ +package raft + +import ( + "reflect" + "regexp" + "testing" + "time" +) + +func TestRandomTimeout(t *testing.T) { + start := time.Now() + timeout := randomTimeout(time.Millisecond) + + select { + case <-timeout: + diff := time.Now().Sub(start) + if diff < time.Millisecond { + t.Fatalf("fired early") + } + case <-time.After(3 * time.Millisecond): + t.Fatalf("timeout") + } +} + +func TestNewSeed(t *testing.T) { + vals := make(map[int64]bool) + for i := 0; i < 1000; i++ { + seed := newSeed() + if _, exists := vals[seed]; exists { + t.Fatal("newSeed() return a value it'd previously returned") + } + vals[seed] = true + } +} + +func TestRandomTimeout_NoTime(t *testing.T) { + timeout := randomTimeout(0) + if timeout != nil { + t.Fatalf("expected nil channel") + } +} + +func TestMin(t *testing.T) { + if min(1, 1) != 1 { + t.Fatalf("bad min") + } + if min(2, 1) != 1 { + t.Fatalf("bad min") + } + if min(1, 2) != 1 { + t.Fatalf("bad min") + } +} + +func TestMax(t *testing.T) { + if max(1, 1) != 1 { + t.Fatalf("bad max") + } + if max(2, 1) != 2 { + t.Fatalf("bad max") + } + if max(1, 2) != 2 { + t.Fatalf("bad max") + } +} + +func TestGenerateUUID(t *testing.T) { + prev := generateUUID() + for i := 0; i < 100; i++ { + id := generateUUID() + if prev == id { + t.Fatalf("Should get a new ID!") + } + + matched, err := regexp.MatchString( + `[\da-f]{8}-[\da-f]{4}-[\da-f]{4}-[\da-f]{4}-[\da-f]{12}`, id) + if !matched || err != nil { + t.Fatalf("expected match %s %v %s", id, matched, err) + } + } +} + +func TestExcludePeer(t *testing.T) { + peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()} + peer := peers[2] + + after := ExcludePeer(peers, peer) + if len(after) != 2 { + t.Fatalf("Bad length") + } + if after[0] == peer || after[1] == peer { + t.Fatalf("should not contain peer") + } +} + +func TestPeerContained(t *testing.T) { + peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()} + + if !PeerContained(peers, peers[2]) { + t.Fatalf("Expect contained") + } + if PeerContained(peers, NewInmemAddr()) { + t.Fatalf("unexpected contained") + } +} + +func TestAddUniquePeer(t 
*testing.T) { + peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()} + after := AddUniquePeer(peers, peers[2]) + if !reflect.DeepEqual(after, peers) { + t.Fatalf("unexpected append") + } + after = AddUniquePeer(peers, NewInmemAddr()) + if len(after) != 4 { + t.Fatalf("expected append") + } +} + +func TestEncodeDecodePeers(t *testing.T) { + peers := []string{NewInmemAddr(), NewInmemAddr(), NewInmemAddr()} + _, trans := NewInmemTransport("") + + // Try to encode/decode + buf := encodePeers(peers, trans) + decoded := decodePeers(buf, trans) + + if !reflect.DeepEqual(peers, decoded) { + t.Fatalf("mismatch %v %v", peers, decoded) + } +} + +func TestBackoff(t *testing.T) { + b := backoff(10*time.Millisecond, 1, 8) + if b != 10*time.Millisecond { + t.Fatalf("bad: %v", b) + } + + b = backoff(20*time.Millisecond, 2, 8) + if b != 20*time.Millisecond { + t.Fatalf("bad: %v", b) + } + + b = backoff(10*time.Millisecond, 8, 8) + if b != 640*time.Millisecond { + t.Fatalf("bad: %v", b) + } + + b = backoff(10*time.Millisecond, 9, 8) + if b != 640*time.Millisecond { + t.Fatalf("bad: %v", b) + } +} diff --git a/go/vt/orchestrator/external/zk/zk.go b/go/vt/orchestrator/external/zk/zk.go new file mode 100644 index 0000000000..894df772c5 --- /dev/null +++ b/go/vt/orchestrator/external/zk/zk.go @@ -0,0 +1,404 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// zk provides with higher level commands over the lower level zookeeper connector +package zk + +import ( + "bytes" + "errors" + "fmt" + "math" + gopath "path" + "sort" + "strconv" + "strings" + "time" + + "github.com/samuel/go-zookeeper/zk" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +type ZooKeeper struct { + servers []string + authScheme string + authExpression []byte + + // We assume complete access to all + flags int32 + acl []zk.ACL +} + +func NewZooKeeper() *ZooKeeper { + return &ZooKeeper{ + flags: int32(0), + acl: zk.WorldACL(zk.PermAll), + } +} + +// SetServers sets the list of servers for the zookeeper client to connect to. +// Each element in the array should be in either of following forms: +// - "servername" +// - "servername:port" +func (zook *ZooKeeper) SetServers(serversArray []string) { + zook.servers = serversArray +} + +func (zook *ZooKeeper) SetAuth(scheme string, auth []byte) { + log.Debug("Setting Auth ") + zook.authScheme = scheme + zook.authExpression = auth +} + +// Returns acls +func (zook *ZooKeeper) BuildACL(authScheme string, user string, pwd string, acls string) (perms []zk.ACL, err error) { + aclsList := strings.Split(acls, ",") + for _, elem := range aclsList { + acl, err := strconv.ParseInt(elem, 10, 32) + if err != nil { + break + } + perm := zk.DigestACL(int32(acl), user, pwd) + perms = append(perms, perm[0]) + } + return perms, err +} + +type infoLogger struct{} + +func (_ infoLogger) Printf(format string, a ...interface{}) { + log.Infof(format, a...) 
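+	// Note: this adapter only forwards to the orchestrator log package; it is
+	// installed as zk.DefaultLogger in connect() below so the zookeeper client's
+	// internal messages are captured at info level.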
+} + +// connect +func (zook *ZooKeeper) connect() (*zk.Conn, error) { + zk.DefaultLogger = &infoLogger{} + conn, _, err := zk.Connect(zook.servers, time.Second) + if err == nil && zook.authScheme != "" { + log.Debugf("Add Auth %s %s", zook.authScheme, zook.authExpression) + err = conn.AddAuth(zook.authScheme, zook.authExpression) + } + + return conn, err +} + +// Exists returns true when the given path exists +func (zook *ZooKeeper) Exists(path string) (bool, error) { + connection, err := zook.connect() + if err != nil { + return false, err + } + defer connection.Close() + + exists, _, err := connection.Exists(path) + return exists, err +} + +// Get returns value associated with given path, or error if path does not exist +func (zook *ZooKeeper) Get(path string) ([]byte, error) { + connection, err := zook.connect() + if err != nil { + return []byte{}, err + } + defer connection.Close() + + data, _, err := connection.Get(path) + return data, err +} + +func (zook *ZooKeeper) GetACL(path string) (data []string, err error) { + connection, err := zook.connect() + if err != nil { + return nil, err + } + defer connection.Close() + + perms, _, err := connection.GetACL(path) + return zook.aclsToString(perms), err +} + +func (zook *ZooKeeper) aclsToString(acls []zk.ACL) (result []string) { + for _, acl := range acls { + var buffer bytes.Buffer + + buffer.WriteString(fmt.Sprintf("%v:%v:", acl.Scheme, acl.ID)) + + if acl.Perms&zk.PermCreate != 0 { + buffer.WriteString("c") + } + if acl.Perms&zk.PermDelete != 0 { + buffer.WriteString("d") + } + if acl.Perms&zk.PermRead != 0 { + buffer.WriteString("r") + } + if acl.Perms&zk.PermWrite != 0 { + buffer.WriteString("w") + } + if acl.Perms&zk.PermAdmin != 0 { + buffer.WriteString("a") + } + result = append(result, buffer.String()) + } + return result +} + +// Children returns sub-paths of given path, optionally empty array, or error if path does not exist +func (zook *ZooKeeper) Children(path string) ([]string, error) { + connection, err := zook.connect() + if err != nil { + return []string{}, err + } + defer connection.Close() + + children, _, err := connection.Children(path) + return children, err +} + +// childrenRecursiveInternal: internal implementation of recursive-children query. +func (zook *ZooKeeper) childrenRecursiveInternal(connection *zk.Conn, path string, incrementalPath string) ([]string, error) { + children, _, err := connection.Children(path) + if err != nil { + return children, err + } + sort.Sort(sort.StringSlice(children)) + recursiveChildren := []string{} + for _, child := range children { + incrementalChild := gopath.Join(incrementalPath, child) + recursiveChildren = append(recursiveChildren, incrementalChild) + log.Debugf("incremental child: %+v", incrementalChild) + incrementalChildren, err := zook.childrenRecursiveInternal(connection, gopath.Join(path, child), incrementalChild) + if err != nil { + return children, err + } + recursiveChildren = append(recursiveChildren, incrementalChildren...) + } + return recursiveChildren, err +} + +// ChildrenRecursive returns list of all descendants of given path (optionally empty), or error if the path +// does not exist. +// Every element in result list is a relative subpath for the given path. 
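+//
+// Illustrative usage (server addresses and paths are hypothetical):
+//
+//	zook := NewZooKeeper()
+//	zook.SetServers([]string{"zk1:2181", "zk2:2181"})
+//	subpaths, err := zook.ChildrenRecursive("/app")
+//	if err == nil {
+//		// e.g. subpaths == []string{"config", "config/db", "locks"}
+//	}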
+func (zook *ZooKeeper) ChildrenRecursive(path string) ([]string, error) { + connection, err := zook.connect() + if err != nil { + return []string{}, err + } + defer connection.Close() + + result, err := zook.childrenRecursiveInternal(connection, path, "") + return result, err +} + +// createInternal: create a new path +func (zook *ZooKeeper) createInternal(connection *zk.Conn, path string, data []byte, acl []zk.ACL, force bool) (string, error) { + if path == "/" { + return "/", nil + } + + log.Debugf("creating: %s", path) + attempts := 0 + for { + attempts += 1 + returnValue, err := connection.Create(path, data, zook.flags, zook.acl) + log.Debugf("create status for %s: %s, %+v", path, returnValue, err) + + if err != nil && force && attempts < 2 { + parentPath := gopath.Dir(path) + if parentPath == path { + return returnValue, err + } + returnValue, err = zook.createInternal(connection, parentPath, []byte("zookeepercli auto-generated"), acl, force) + } else { + return returnValue, err + } + } + return "", nil +} + +// createInternalWithACL: create a new path with acl +func (zook *ZooKeeper) createInternalWithACL(connection *zk.Conn, path string, data []byte, force bool, perms []zk.ACL) (string, error) { + if path == "/" { + return "/", nil + } + log.Debugf("creating: %s with acl ", path) + attempts := 0 + for { + attempts += 1 + returnValue, err := connection.Create(path, data, zook.flags, perms) + log.Debugf("create status for %s: %s, %+v", path, returnValue, err) + if err != nil && force && attempts < 2 { + returnValue, err = zook.createInternalWithACL(connection, gopath.Dir(path), []byte("zookeepercli auto-generated"), force, perms) + } else { + return returnValue, err + } + } + return "", nil +} + +// Create will create a new path, or exit with error should the path exist. +// The "force" param controls the behavior when path's parent directory does not exist. +// When "force" is false, the function returns with error/ When "force" is true, it recursively +// attempts to create required parent directories. 
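+//
+// Hedged usage sketch (server address, path and data are hypothetical; the ACL
+// string format is the one accepted by parseACLString below):
+//
+//	zook := NewZooKeeper()
+//	zook.SetServers([]string{"zk1:2181"})
+//	// force=true also creates the missing parents /app and /app/config
+//	_, err := zook.Create("/app/config/db", []byte("db1:3306"), "world:anyone:cdrwa", true)
+//	if err != nil {
+//		// handle error
+//	}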
+func (zook *ZooKeeper) Create(path string, data []byte, aclstr string, force bool) (string, error) { + connection, err := zook.connect() + if err != nil { + return "", err + } + defer connection.Close() + + if len(aclstr) > 0 { + zook.acl, err = zook.parseACLString(aclstr) + if err != nil { + return "", err + } + } + + return zook.createInternal(connection, path, data, zook.acl, force) +} + +func (zook *ZooKeeper) CreateWithACL(path string, data []byte, force bool, perms []zk.ACL) (string, error) { + connection, err := zook.connect() + if err != nil { + return "", err + } + defer connection.Close() + + return zook.createInternalWithACL(connection, path, data, force, perms) +} + +// Set updates a value for a given path, or returns with error if the path does not exist +func (zook *ZooKeeper) Set(path string, data []byte) (*zk.Stat, error) { + connection, err := zook.connect() + if err != nil { + return nil, err + } + defer connection.Close() + + return connection.Set(path, data, -1) +} + +// updates the ACL on a given path +func (zook *ZooKeeper) SetACL(path string, aclstr string, force bool) (string, error) { + connection, err := zook.connect() + if err != nil { + return "", err + } + defer connection.Close() + + acl, err := zook.parseACLString(aclstr) + if err != nil { + return "", err + } + + if force { + exists, _, err := connection.Exists(path) + if err != nil { + return "", err + } + + if !exists { + return zook.createInternal(connection, path, []byte(""), acl, force) + } + } + + _, err = connection.SetACL(path, acl, -1) + return path, err +} + +func (zook *ZooKeeper) parseACLString(aclstr string) (acl []zk.ACL, err error) { + aclsList := strings.Split(aclstr, ",") + for _, entry := range aclsList { + parts := strings.Split(entry, ":") + var scheme, id string + var perms int32 + if len(parts) > 3 && parts[0] == "digest" { + scheme = parts[0] + id = fmt.Sprintf("%s:%s", parts[1], parts[2]) + perms, err = zook.parsePermsString(parts[3]) + } else { + scheme, id = parts[0], parts[1] + perms, err = zook.parsePermsString(parts[2]) + } + + if err == nil { + perm := zk.ACL{Scheme: scheme, ID: id, Perms: perms} + acl = append(acl, perm) + } + } + return acl, err +} + +func (zook *ZooKeeper) parsePermsString(permstr string) (perms int32, err error) { + if x, e := strconv.ParseFloat(permstr, 64); e == nil { + perms = int32(math.Min(x, 31)) + } else { + for _, rune := range strings.Split(permstr, "") { + switch rune { + case "r": + perms |= zk.PermRead + break + case "w": + perms |= zk.PermWrite + break + case "c": + perms |= zk.PermCreate + break + case "d": + perms |= zk.PermDelete + break + case "a": + perms |= zk.PermAdmin + break + default: + err = errors.New("invalid ACL string specified") + } + + if err != nil { + break + } + } + } + return perms, err +} + +// Delete removes a path entry. It exits with error if the path does not exist, or has subdirectories. +func (zook *ZooKeeper) Delete(path string) error { + connection, err := zook.connect() + if err != nil { + return err + } + defer connection.Close() + + return connection.Delete(path, -1) +} + +// Delete recursive if has subdirectories. 
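+// Children are removed first, deepest paths before their parents (the reversed
+// ChildrenRecursive order), and only then the node itself. Note that errors
+// encountered while deleting children are passed to log.Fatale rather than
+// returned to the caller; only the final Delete of the node itself reports its
+// error back.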
+func (zook *ZooKeeper) DeleteRecursive(path string) error { + result, err := zook.ChildrenRecursive(path) + if err != nil { + log.Fatale(err) + } + + for i := len(result) - 1; i >= 0; i-- { + znode := path + "/" + result[i] + if err = zook.Delete(znode); err != nil { + log.Fatale(err) + } + } + + return zook.Delete(path) +} diff --git a/go/vt/orchestrator/external/zk/zk_test.go b/go/vt/orchestrator/external/zk/zk_test.go new file mode 100644 index 0000000000..1be602ca03 --- /dev/null +++ b/go/vt/orchestrator/external/zk/zk_test.go @@ -0,0 +1,76 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +// zk provides with higher level commands over the lower level zookeeper connector +package zk + +import ( + "testing" + + "github.com/samuel/go-zookeeper/zk" +) + +func TestParseACLString(t *testing.T) { + cases := []struct { + aclstr string + want []zk.ACL + }{ + {"world:anyone:cdrwa", []zk.ACL{{Scheme: "world", ID: "anyone", Perms: 31}}}, + {"world:anyone:rw", []zk.ACL{{Scheme: "world", ID: "anyone", Perms: 3}}}, + {"world:anyone:3", []zk.ACL{{Scheme: "world", ID: "anyone", Perms: 3}}}, + {"host:example.com:cdrw", []zk.ACL{{Scheme: "host", ID: "example.com", Perms: 15}}}, + {"ip:10.2.1.15/32:cdrwa", []zk.ACL{{Scheme: "ip", ID: "10.2.1.15/32", Perms: 31}}}, + {"digest:username:pwhash:cd", []zk.ACL{{Scheme: "digest", ID: "username:pwhash", Perms: 12}}}, + {"auth::cdrwa", []zk.ACL{{Scheme: "auth", ID: "", Perms: 31}}}, + } + + for _, c := range cases { + zook := NewZooKeeper() + got, _ := zook.parseACLString(c.aclstr) + if !aclsEqual(got, c.want) { + t.Errorf("parseACLString(%q) == %q, want %q", c.aclstr, got, c.want) + } + } +} + +func TestParseInvalidACLString(t *testing.T) { + aclstr := "world:anyone:rwb" + want := "invalid ACL string specified" + + zook := NewZooKeeper() + _, err := zook.parseACLString(aclstr) + + if err == nil { + t.Error("No error returned") + } else { + if err.Error() != want { + t.Errorf("parseACLString(%q) error %q, want %q", aclstr, err.Error(), want) + } + } +} + +func aclsEqual(a, b []zk.ACL) bool { + if len(a) != len(b) { + return false + } + + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} diff --git a/go/vt/orchestrator/http/agents_api.go b/go/vt/orchestrator/http/agents_api.go new file mode 100644 index 0000000000..b62ed1fe70 --- /dev/null +++ b/go/vt/orchestrator/http/agents_api.go @@ -0,0 +1,133 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package http + +import ( + "fmt" + "net/http" + "strconv" + "strings" + + "github.com/go-martini/martini" + "github.com/martini-contrib/render" + + "vitess.io/vitess/go/vt/orchestrator/agent" + "vitess.io/vitess/go/vt/orchestrator/attributes" +) + +type HttpAgentsAPI struct { + URLPrefix string +} + +var AgentsAPI HttpAgentsAPI = HttpAgentsAPI{} + +// SubmitAgent registeres an agent. It is initiated by an agent to register itself. +func (this *HttpAgentsAPI) SubmitAgent(params martini.Params, r render.Render) { + port, err := strconv.Atoi(params["port"]) + if err != nil { + r.JSON(200, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + output, err := agent.SubmitAgent(params["host"], port, params["token"]) + if err != nil { + r.JSON(200, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + r.JSON(200, output) +} + +// SetHostAttribute is a utility method that allows per-host key-value store. +func (this *HttpAgentsAPI) SetHostAttribute(params martini.Params, r render.Render, req *http.Request) { + err := attributes.SetHostAttributes(params["host"], params["attrVame"], params["attrValue"]) + + if err != nil { + r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(200, (err == nil)) +} + +// GetHostAttributeByAttributeName returns a host attribute +func (this *HttpAgentsAPI) GetHostAttributeByAttributeName(params martini.Params, r render.Render, req *http.Request) { + + output, err := attributes.GetHostAttributesByAttribute(params["attr"], req.URL.Query().Get("valueMatch")) + + if err != nil { + r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(200, output) +} + +// AgentsHosts provides list of agent host names +func (this *HttpAgentsAPI) AgentsHosts(params martini.Params, r render.Render, req *http.Request) string { + agents, err := agent.ReadAgents() + hostnames := []string{} + for _, agent := range agents { + hostnames = append(hostnames, agent.Hostname) + } + + if err != nil { + r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return "" + } + + if req.URL.Query().Get("format") == "txt" { + return strings.Join(hostnames, "\n") + } else { + r.JSON(200, hostnames) + } + return "" +} + +// AgentsInstances provides list of assumed MySQL instances (host:port) +func (this *HttpAgentsAPI) AgentsInstances(params martini.Params, r render.Render, req *http.Request) string { + agents, err := agent.ReadAgents() + hostnames := []string{} + for _, agent := range agents { + hostnames = append(hostnames, fmt.Sprintf("%s:%d", agent.Hostname, agent.MySQLPort)) + } + + if err != nil { + r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return "" + } + + if req.URL.Query().Get("format") == "txt" { + return strings.Join(hostnames, "\n") + } else { + r.JSON(200, hostnames) + } + return "" +} + +func (this *HttpAgentsAPI) AgentPing(params martini.Params, r render.Render, req *http.Request) { + r.JSON(200, "OK") +} + +// RegisterRequests makes for the de-facto list of known API calls +func (this *HttpAgentsAPI) RegisterRequests(m *martini.ClassicMartini) { + m.Get(this.URLPrefix+"/api/submit-agent/:host/:port/:token", this.SubmitAgent) + m.Get(this.URLPrefix+"/api/host-attribute/:host/:attrVame/:attrValue", this.SetHostAttribute) + m.Get(this.URLPrefix+"/api/host-attribute/attr/:attr/", this.GetHostAttributeByAttributeName) + m.Get(this.URLPrefix+"/api/agents-hosts", this.AgentsHosts) + m.Get(this.URLPrefix+"/api/agents-instances", 
this.AgentsInstances) + m.Get(this.URLPrefix+"/api/agent-ping", this.AgentPing) +} diff --git a/go/vt/orchestrator/http/api.go b/go/vt/orchestrator/http/api.go new file mode 100644 index 0000000000..4e43bebb6d --- /dev/null +++ b/go/vt/orchestrator/http/api.go @@ -0,0 +1,3938 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package http + +import ( + "encoding/json" + "fmt" + "net" + "net/http" + "strconv" + "strings" + "time" + + "github.com/go-martini/martini" + "github.com/martini-contrib/auth" + "github.com/martini-contrib/render" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/util" + + "vitess.io/vitess/go/vt/orchestrator/agent" + "vitess.io/vitess/go/vt/orchestrator/collection" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/discovery" + "vitess.io/vitess/go/vt/orchestrator/inst" + "vitess.io/vitess/go/vt/orchestrator/logic" + "vitess.io/vitess/go/vt/orchestrator/metrics/query" + "vitess.io/vitess/go/vt/orchestrator/process" + orcraft "vitess.io/vitess/go/vt/orchestrator/raft" +) + +// APIResponseCode is an OK/ERROR response code +type APIResponseCode int + +const ( + ERROR APIResponseCode = iota + OK +) + +var apiSynonyms = map[string]string{ + "relocate-slaves": "relocate-replicas", + "regroup-slaves": "regroup-replicas", + "move-up-slaves": "move-up-replicas", + "repoint-slaves": "repoint-replicas", + "enslave-siblings": "take-siblings", + "enslave-master": "take-master", + "regroup-slaves-bls": "regroup-replicas-bls", + "move-slaves-gtid": "move-replicas-gtid", + "regroup-slaves-gtid": "regroup-replicas-gtid", + "match-slaves": "match-replicas", + "match-up-slaves": "match-up-replicas", + "regroup-slaves-pgtid": "regroup-replicas-pgtid", + "detach-slave": "detach-replica", + "reattach-slave": "reattach-replica", + "detach-slave-master-host": "detach-replica-master-host", + "reattach-slave-master-host": "reattach-replica-master-host", + "cluster-osc-slaves": "cluster-osc-replicas", + "start-slave": "start-replica", + "restart-slave": "restart-replica", + "stop-slave": "stop-replica", + "stop-slave-nice": "stop-replica-nice", + "reset-slave": "reset-replica", + "restart-slave-statements": "restart-replica-statements", +} + +var registeredPaths = []string{} +var emptyInstanceKey inst.InstanceKey + +func (this *APIResponseCode) MarshalJSON() ([]byte, error) { + return json.Marshal(this.String()) +} + +func (this *APIResponseCode) String() string { + switch *this { + case ERROR: + return "ERROR" + case OK: + return "OK" + } + return "unknown" +} + +// HttpStatus returns the respective HTTP status for this response +func (this *APIResponseCode) HttpStatus() int { + switch *this { + case ERROR: + return http.StatusInternalServerError + case OK: + return http.StatusOK + } + return http.StatusNotImplemented +} + +// APIResponse is a response returned as JSON to various requests. 
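+//
+// Because APIResponseCode implements MarshalJSON above, Code is rendered as the
+// string "OK" or "ERROR"; a response therefore looks roughly like (illustrative):
+//
+//	{"Code": "OK", "Message": "Instance discovered: ...", "Details": {...}}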
+type APIResponse struct { + Code APIResponseCode + Message string + Details interface{} +} + +func Respond(r render.Render, apiResponse *APIResponse) { + r.JSON(apiResponse.Code.HttpStatus(), apiResponse) +} + +type HttpAPI struct { + URLPrefix string +} + +var API HttpAPI = HttpAPI{} +var discoveryMetrics = collection.CreateOrReturnCollection("DISCOVERY_METRICS") +var queryMetrics = collection.CreateOrReturnCollection("BACKEND_WRITES") +var writeBufferMetrics = collection.CreateOrReturnCollection("WRITE_BUFFER") + +func (this *HttpAPI) getInstanceKeyInternal(host string, port string, resolve bool) (inst.InstanceKey, error) { + var instanceKey *inst.InstanceKey + var err error + if resolve { + instanceKey, err = inst.NewResolveInstanceKeyStrings(host, port) + } else { + instanceKey, err = inst.NewRawInstanceKeyStrings(host, port) + } + if err != nil { + return emptyInstanceKey, err + } + instanceKey, err = inst.FigureInstanceKey(instanceKey, nil) + if err != nil { + return emptyInstanceKey, err + } + if instanceKey == nil { + return emptyInstanceKey, fmt.Errorf("Unexpected nil instanceKey in getInstanceKeyInternal(%+v, %+v, %+v)", host, port, resolve) + } + return *instanceKey, nil +} + +func (this *HttpAPI) getInstanceKey(host string, port string) (inst.InstanceKey, error) { + return this.getInstanceKeyInternal(host, port, true) +} + +func (this *HttpAPI) getNoResolveInstanceKey(host string, port string) (inst.InstanceKey, error) { + return this.getInstanceKeyInternal(host, port, false) +} + +func getTag(params martini.Params, req *http.Request) (tag *inst.Tag, err error) { + tagString := req.URL.Query().Get("tag") + if tagString != "" { + return inst.ParseTag(tagString) + } + return inst.NewTag(params["tagName"], params["tagValue"]) +} + +func (this *HttpAPI) getBinlogCoordinates(logFile string, logPos string) (inst.BinlogCoordinates, error) { + coordinates := inst.BinlogCoordinates{LogFile: logFile} + var err error + if coordinates.LogPos, err = strconv.ParseInt(logPos, 10, 0); err != nil { + return coordinates, fmt.Errorf("Invalid logPos: %s", logPos) + } + + return coordinates, err +} + +// InstanceReplicas lists all replicas of given instance +func (this *HttpAPI) InstanceReplicas(params martini.Params, r render.Render, req *http.Request) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + replicas, err := inst.ReadReplicaInstances(&instanceKey) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", instanceKey)}) + return + } + r.JSON(http.StatusOK, replicas) +} + +// Instance reads and returns an instance's details. +func (this *HttpAPI) Instance(params martini.Params, r render.Render, req *http.Request) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, found, err := inst.ReadInstance(&instanceKey) + if (!found) || (err != nil) { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", instanceKey)}) + return + } + r.JSON(http.StatusOK, instance) +} + +// AsyncDiscover issues an asynchronous read on an instance. This is +// useful for bulk loads of a new set of instances and will not block +// if the instance is slow to respond or not reachable. 
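+// It validates the instance key, fires Discover (below) in a goroutine and
+// immediately acknowledges with an "Asynchronous discovery initiated" message,
+// so the HTTP response says nothing about whether the discovery itself
+// eventually succeeds.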
+func (this *HttpAPI) AsyncDiscover(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + go this.Discover(params, r, req, user) + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Asynchronous discovery initiated for Instance: %+v", instanceKey)}) +} + +// Discover issues a synchronous read on an instance +func (this *HttpAPI) Discover(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.ReadTopologyInstance(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + if orcraft.IsRaftEnabled() { + orcraft.PublishCommand("discover", instanceKey) + } else { + logic.DiscoverInstance(instanceKey) + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance discovered: %+v", instance.Key), Details: instance}) +} + +// Refresh synchronuously re-reads a topology instance +func (this *HttpAPI) Refresh(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + _, err = inst.RefreshTopologyInstance(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance refreshed: %+v", instanceKey), Details: instanceKey}) +} + +// Forget removes an instance entry fro backend database +func (this *HttpAPI) Forget(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getNoResolveInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("forget", instanceKey) + } else { + err = inst.ForgetInstance(&instanceKey) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance forgotten: %+v", instanceKey), Details: instanceKey}) +} + +// ForgetCluster forgets all instacnes of a cluster +func (this *HttpAPI) ForgetCluster(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + clusterName, err := figureClusterName(getClusterHint(params)) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + if orcraft.IsRaftEnabled() { + orcraft.PublishCommand("forget-cluster", clusterName) + } else { + 
inst.ForgetCluster(clusterName) + } + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Cluster forgotten: %+v", clusterName)}) +} + +// Resolve tries to resolve hostname and then checks to see if port is open on that host. +func (this *HttpAPI) Resolve(params martini.Params, r render.Render, req *http.Request) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + if conn, err := net.Dial("tcp", instanceKey.DisplayString()); err == nil { + conn.Close() + } else { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: "Instance resolved", Details: instanceKey}) +} + +// BeginMaintenance begins maintenance mode for given instance +func (this *HttpAPI) BeginMaintenance(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + key, err := inst.BeginBoundedMaintenance(&instanceKey, params["owner"], params["reason"], 0, true) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error(), Details: key}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Maintenance begun: %+v", instanceKey), Details: instanceKey}) +} + +// EndMaintenance terminates maintenance mode +func (this *HttpAPI) EndMaintenance(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + maintenanceKey, err := strconv.ParseInt(params["maintenanceKey"], 10, 0) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + _, err = inst.EndMaintenance(maintenanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Maintenance ended: %+v", maintenanceKey), Details: maintenanceKey}) +} + +// EndMaintenanceByInstanceKey terminates maintenance mode for given instance +func (this *HttpAPI) EndMaintenanceByInstanceKey(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + _, err = inst.EndMaintenanceByInstanceKey(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Maintenance ended: %+v", instanceKey), Details: instanceKey}) +} + +// EndMaintenanceByInstanceKey terminates maintenance mode for given instance +func (this *HttpAPI) InMaintenance(params martini.Params, r render.Render, req *http.Request, user auth.User) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + inMaintenance, err := inst.InMaintenance(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: 
err.Error()}) + return + } + responseDetails := "" + if inMaintenance { + responseDetails = instanceKey.StringCode() + } + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("%+v", inMaintenance), Details: responseDetails}) +} + +// Maintenance provides list of instance under active maintenance +func (this *HttpAPI) Maintenance(params martini.Params, r render.Render, req *http.Request) { + maintenanceList, err := inst.ReadActiveMaintenance() + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, maintenanceList) +} + +// BeginDowntime sets a downtime flag with default duration +func (this *HttpAPI) BeginDowntime(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + var durationSeconds int = 0 + if params["duration"] != "" { + durationSeconds, err = util.SimpleTimeToSeconds(params["duration"]) + if durationSeconds < 0 { + err = fmt.Errorf("Duration value must be non-negative. Given value: %d", durationSeconds) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + } + duration := time.Duration(durationSeconds) * time.Second + downtime := inst.NewDowntime(&instanceKey, params["owner"], params["reason"], duration) + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("begin-downtime", downtime) + } else { + err = inst.BeginDowntime(downtime) + } + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error(), Details: instanceKey}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Downtime begun: %+v", instanceKey), Details: instanceKey}) +} + +// EndDowntime terminates downtime (removes downtime flag) for an instance +func (this *HttpAPI) EndDowntime(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("end-downtime", instanceKey) + } else { + _, err = inst.EndDowntime(&instanceKey) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Downtime ended: %+v", instanceKey), Details: instanceKey}) +} + +// MoveUp attempts to move an instance up the topology +func (this *HttpAPI) MoveUp(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.MoveUp(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance %+v moved up", instanceKey), Details: instance}) +} + +// MoveUpReplicas attempts to move up all 
replicas of an instance +func (this *HttpAPI) MoveUpReplicas(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + replicas, newMaster, err, errs := inst.MoveUpReplicas(&instanceKey, req.URL.Query().Get("pattern")) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Moved up %d replicas of %+v below %+v; %d errors: %+v", len(replicas), instanceKey, newMaster.Key, len(errs), errs), Details: replicas}) +} + +// Repoint positiones a replica under another (or same) master with exact same coordinates. +// Useful for binlog servers +func (this *HttpAPI) Repoint(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + belowKey, err := this.getInstanceKey(params["belowHost"], params["belowPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, err := inst.Repoint(&instanceKey, &belowKey, inst.GTIDHintNeutral) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance %+v repointed below %+v", instanceKey, belowKey), Details: instance}) +} + +// MoveUpReplicas attempts to move up all replicas of an instance +func (this *HttpAPI) RepointReplicas(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + replicas, err, _ := inst.RepointReplicas(&instanceKey, req.URL.Query().Get("pattern")) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Repointed %d replicas of %+v", len(replicas), instanceKey), Details: replicas}) +} + +// MakeCoMaster attempts to make an instance co-master with its own master +func (this *HttpAPI) MakeCoMaster(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.MakeCoMaster(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance made co-master: %+v", instance.Key), Details: instance}) +} + +// ResetReplication makes a replica forget about its master, effectively breaking the replication +func (this *HttpAPI) ResetReplication(params martini.Params, r 
render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.ResetReplicationOperation(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Replica reset on %+v", instance.Key), Details: instance}) +} + +// DetachReplicaMasterHost detaches a replica from its master by setting an invalid +// (yet revertible) host name +func (this *HttpAPI) DetachReplicaMasterHost(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.DetachReplicaMasterHost(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Replica detached: %+v", instance.Key), Details: instance}) +} + +// ReattachReplicaMasterHost reverts a detachReplicaMasterHost command +// by resoting the original master hostname in CHANGE MASTER TO +func (this *HttpAPI) ReattachReplicaMasterHost(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.ReattachReplicaMasterHost(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Replica reattached: %+v", instance.Key), Details: instance}) +} + +// EnableGTID attempts to enable GTID on a replica +func (this *HttpAPI) EnableGTID(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.EnableGTID(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Enabled GTID on %+v", instance.Key), Details: instance}) +} + +// DisableGTID attempts to disable GTID on a replica, and revert to binlog file:pos +func (this *HttpAPI) DisableGTID(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.DisableGTID(&instanceKey) + if err != nil { + Respond(r, 
&APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Disabled GTID on %+v", instance.Key), Details: instance}) +} + +// LocateErrantGTID identifies the binlog positions for errant GTIDs on an instance +func (this *HttpAPI) LocateErrantGTID(params martini.Params, r render.Render, req *http.Request, user auth.User) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + errantBinlogs, err := inst.LocateErrantGTID(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("located errant GTID"), Details: errantBinlogs}) +} + +// ErrantGTIDResetMaster removes errant transactions on a server by way of RESET MASTER +func (this *HttpAPI) ErrantGTIDResetMaster(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.ErrantGTIDResetMaster(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Removed errant GTID on %+v and issued a RESET MASTER", instance.Key), Details: instance}) +} + +// ErrantGTIDInjectEmpty removes errant transactions by injecting and empty transaction on the cluster's master +func (this *HttpAPI) ErrantGTIDInjectEmpty(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, clusterMaster, countInjectedTransactions, err := inst.ErrantGTIDInjectEmpty(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Have injected %+v transactions on cluster master %+v", countInjectedTransactions, clusterMaster.Key), Details: instance}) +} + +// MoveBelow attempts to move an instance below its supposed sibling +func (this *HttpAPI) MoveBelow(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + siblingKey, err := this.getInstanceKey(params["siblingHost"], params["siblingPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, err := inst.MoveBelow(&instanceKey, &siblingKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance %+v moved below %+v", instanceKey, siblingKey), Details: instance}) +} + +// MoveBelowGTID attempts to move an instance below another, via GTID +func (this 
*HttpAPI) MoveBelowGTID(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + belowKey, err := this.getInstanceKey(params["belowHost"], params["belowPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, err := inst.MoveBelowGTID(&instanceKey, &belowKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance %+v moved below %+v via GTID", instanceKey, belowKey), Details: instance}) +} + +// MoveReplicasGTID attempts to move an instance below another, via GTID +func (this *HttpAPI) MoveReplicasGTID(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + belowKey, err := this.getInstanceKey(params["belowHost"], params["belowPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + movedReplicas, _, err, errs := inst.MoveReplicasGTID(&instanceKey, &belowKey, req.URL.Query().Get("pattern")) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Moved %d replicas of %+v below %+v via GTID; %d errors: %+v", len(movedReplicas), instanceKey, belowKey, len(errs), errs), Details: belowKey}) +} + +// TakeSiblings +func (this *HttpAPI) TakeSiblings(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, count, err := inst.TakeSiblings(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Took %d siblings of %+v", count, instanceKey), Details: instance}) +} + +// TakeMaster +func (this *HttpAPI) TakeMaster(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, err := inst.TakeMaster(&instanceKey, false) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("%+v took its master", instanceKey), Details: instance}) +} + +// RelocateBelow attempts to move an instance below another, orchestrator choosing the best (potentially multi-step) +// relocation method +func (this *HttpAPI) RelocateBelow(params martini.Params, r render.Render, req *http.Request, 
user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + belowKey, err := this.getInstanceKey(params["belowHost"], params["belowPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, err := inst.RelocateBelow(&instanceKey, &belowKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance %+v relocated below %+v", instanceKey, belowKey), Details: instance}) +} + +// Relocates attempts to smartly relocate replicas of a given instance below another +func (this *HttpAPI) RelocateReplicas(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + belowKey, err := this.getInstanceKey(params["belowHost"], params["belowPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + replicas, _, err, errs := inst.RelocateReplicas(&instanceKey, &belowKey, req.URL.Query().Get("pattern")) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Relocated %d replicas of %+v below %+v; %d errors: %+v", len(replicas), instanceKey, belowKey, len(errs), errs), Details: replicas}) +} + +// MoveEquivalent attempts to move an instance below another, baseed on known equivalence master coordinates +func (this *HttpAPI) MoveEquivalent(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + belowKey, err := this.getInstanceKey(params["belowHost"], params["belowPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, err := inst.MoveEquivalent(&instanceKey, &belowKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance %+v relocated via equivalence coordinates below %+v", instanceKey, belowKey), Details: instance}) +} + +// LastPseudoGTID attempts to find the last pseugo-gtid entry in an instance +func (this *HttpAPI) LastPseudoGTID(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, found, err := inst.ReadInstance(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + if instance == nil || !found { + Respond(r, &APIResponse{Code: 
ERROR, Message: fmt.Sprintf("Instance not found: %+v", instanceKey)}) + return + } + coordinates, text, err := inst.FindLastPseudoGTIDEntry(instance, instance.RelaylogCoordinates, nil, false, nil) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("%+v", *coordinates), Details: text}) +} + +// MatchBelow attempts to move an instance below another via pseudo GTID matching of binlog entries +func (this *HttpAPI) MatchBelow(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + belowKey, err := this.getInstanceKey(params["belowHost"], params["belowPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, matchedCoordinates, err := inst.MatchBelow(&instanceKey, &belowKey, true) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance %+v matched below %+v at %+v", instanceKey, belowKey, *matchedCoordinates), Details: instance}) +} + +// MatchBelow attempts to move an instance below another via pseudo GTID matching of binlog entries +func (this *HttpAPI) MatchUp(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, matchedCoordinates, err := inst.MatchUp(&instanceKey, true) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance %+v matched up at %+v", instanceKey, *matchedCoordinates), Details: instance}) +} + +// MultiMatchReplicas attempts to match all replicas of a given instance below another, efficiently +func (this *HttpAPI) MultiMatchReplicas(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + belowKey, err := this.getInstanceKey(params["belowHost"], params["belowPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + replicas, newMaster, err, errs := inst.MultiMatchReplicas(&instanceKey, &belowKey, req.URL.Query().Get("pattern")) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Matched %d replicas of %+v below %+v; %d errors: %+v", len(replicas), instanceKey, newMaster.Key, len(errs), errs), Details: newMaster.Key}) +} + +// MatchUpReplicas attempts to match up all replicas of an instance +func (this *HttpAPI) MatchUpReplicas(params martini.Params, r 
render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + replicas, newMaster, err, errs := inst.MatchUpReplicas(&instanceKey, req.URL.Query().Get("pattern")) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Matched up %d replicas of %+v below %+v; %d errors: %+v", len(replicas), instanceKey, newMaster.Key, len(errs), errs), Details: newMaster.Key}) +} + +// RegroupReplicas attempts to pick a replica of a given instance and make it take its siblings, using any +// method possible (GTID, Pseudo-GTID, binlog servers) +func (this *HttpAPI) RegroupReplicas(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + lostReplicas, equalReplicas, aheadReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicas(&instanceKey, false, nil, nil) + lostReplicas = append(lostReplicas, cannotReplicateReplicas...) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("promoted replica: %s, lost: %d, trivial: %d, pseudo-gtid: %d", + promotedReplica.Key.DisplayString(), len(lostReplicas), len(equalReplicas), len(aheadReplicas)), Details: promotedReplica.Key}) +} + +// RegroupReplicas attempts to pick a replica of a given instance and make it take its siblings, efficiently, +// using pseudo-gtid if necessary +func (this *HttpAPI) RegroupReplicasPseudoGTID(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + lostReplicas, equalReplicas, aheadReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasPseudoGTID(&instanceKey, false, nil, nil, nil) + lostReplicas = append(lostReplicas, cannotReplicateReplicas...) 
+ + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("promoted replica: %s, lost: %d, trivial: %d, pseudo-gtid: %d", + promotedReplica.Key.DisplayString(), len(lostReplicas), len(equalReplicas), len(aheadReplicas)), Details: promotedReplica.Key}) +} + +// RegroupReplicasGTID attempts to pick a replica of a given instance and make it take its siblings, efficiently, using GTID +func (this *HttpAPI) RegroupReplicasGTID(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + lostReplicas, movedReplicas, cannotReplicateReplicas, promotedReplica, err := inst.RegroupReplicasGTID(&instanceKey, false, nil, nil, nil) + lostReplicas = append(lostReplicas, cannotReplicateReplicas...) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("promoted replica: %s, lost: %d, moved: %d", + promotedReplica.Key.DisplayString(), len(lostReplicas), len(movedReplicas)), Details: promotedReplica.Key}) +} + +// RegroupReplicasBinlogServers attempts to pick a replica of a given instance and make it take its siblings, efficiently, using binlog servers +func (this *HttpAPI) RegroupReplicasBinlogServers(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + _, promotedBinlogServer, err := inst.RegroupReplicasBinlogServers(&instanceKey, false) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("promoted binlog server: %s", + promotedBinlogServer.Key.DisplayString()), Details: promotedBinlogServer.Key}) +} + +// MakeMaster attempts to make the given instance a master, and match its siblings to be its replicas +func (this *HttpAPI) MakeMaster(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, err := inst.MakeMaster(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance %+v now made master", instanceKey), Details: instance}) +} + +// MakeLocalMaster attempts to make the given instance a local master: take over its master by + enslaving its siblings and replicating from its grandparent.
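// Illustrative, standalone client-side sketch, not part of this patch: how a caller might invoke one of
// the regroup endpoints above and decode the Code/Message/Details envelope that Respond() emits. The base
// URL, the "/api/regroup-replicas/:host/:port" route and the JSON field names are assumptions here.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

// envelope mirrors the APIResponse shape used by the handlers above (assumed field names).
type envelope struct {
	Code    string          `json:"Code"`
	Message string          `json:"Message"`
	Details json.RawMessage `json:"Details"`
}

func main() {
	// Hypothetical orchestrator endpoint and instance.
	url := "http://orchestrator.example.com:3000/api/regroup-replicas/db-1234.example.com/3306"
	resp, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var e envelope
	if err := json.NewDecoder(resp.Body).Decode(&e); err != nil {
		panic(err)
	}
	if e.Code != "OK" {
		fmt.Printf("regroup failed: %s\n", e.Message)
		return
	}
	fmt.Printf("regroup succeeded: %s\n", e.Message)
}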
+func (this *HttpAPI) MakeLocalMaster(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + instance, err := inst.MakeLocalMaster(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Instance %+v now made local master", instanceKey), Details: instance}) +} + +// SkipQuery skips a single query on a failed replication instance +func (this *HttpAPI) SkipQuery(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.SkipQuery(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Query skipped on %+v", instance.Key), Details: instance}) +} + +// StartReplication starts replication on given instance +func (this *HttpAPI) StartReplication(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.StartReplication(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Replica started: %+v", instance.Key), Details: instance}) +} + +// RestartReplication stops & starts replication on given instance +func (this *HttpAPI) RestartReplication(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.RestartReplication(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Replica restarted: %+v", instance.Key), Details: instance}) +} + +// StopReplication stops replication on given instance +func (this *HttpAPI) StopReplication(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.StopReplication(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: 
fmt.Sprintf("Replica stopped: %+v", instance.Key), Details: instance}) +} + +// StopReplicationNicely stops replication on given instance, such that sql thead is aligned with IO thread +func (this *HttpAPI) StopReplicationNicely(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.StopReplicationNicely(&instanceKey, 0) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Replica stopped nicely: %+v", instance.Key), Details: instance}) +} + +// FlushBinaryLogs runs a single FLUSH BINARY LOGS +func (this *HttpAPI) FlushBinaryLogs(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.FlushBinaryLogs(&instanceKey, 1) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Binary logs flushed on: %+v", instance.Key), Details: instance}) +} + +// PurgeBinaryLogs purges binary logs up to given binlog file +func (this *HttpAPI) PurgeBinaryLogs(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + logFile := params["logFile"] + if logFile == "" { + Respond(r, &APIResponse{Code: ERROR, Message: "purge-binary-logs: expected log file name or 'latest'"}) + return + } + force := (req.URL.Query().Get("force") == "true") || (params["force"] == "true") + var instance *inst.Instance + if logFile == "latest" { + instance, err = inst.PurgeBinaryLogsToLatest(&instanceKey, force) + } else { + instance, err = inst.PurgeBinaryLogsTo(&instanceKey, logFile, force) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Binary logs flushed on: %+v", instance.Key), Details: instance}) +} + +// RestartReplicationStatements receives a query to execute that requires a replication restart to apply. +// As an example, this may be `set global rpl_semi_sync_slave_enabled=1`. orchestrator will check +// replication status on given host and will wrap with appropriate stop/start statements, if need be. 
+func (this *HttpAPI) RestartReplicationStatements(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + query := req.URL.Query().Get("q") + statements, err := inst.GetReplicationRestartPreserveStatements(&instanceKey, query) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("statements for: %+v", instanceKey), Details: statements}) +} + +// MasterEquivalent provides (possibly empty) list of master coordinates equivalent to the given ones +func (this *HttpAPI) MasterEquivalent(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + coordinates, err := this.getBinlogCoordinates(params["logFile"], params["logPos"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instanceCoordinates := &inst.InstanceBinlogCoordinates{Key: instanceKey, Coordinates: coordinates} + + equivalentCoordinates, err := inst.GetEquivalentMasterCoordinates(instanceCoordinates) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Found %+v equivalent coordinates", len(equivalentCoordinates)), Details: equivalentCoordinates}) +} + +// CanReplicateFrom attempts to move an instance below another via pseudo GTID matching of binlog entries +func (this *HttpAPI) CanReplicateFrom(params martini.Params, r render.Render, req *http.Request, user auth.User) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, found, err := inst.ReadInstance(&instanceKey) + if (!found) || (err != nil) { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", instanceKey)}) + return + } + belowKey, err := this.getInstanceKey(params["belowHost"], params["belowPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + belowInstance, found, err := inst.ReadInstance(&belowKey) + if (!found) || (err != nil) { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", belowKey)}) + return + } + + canReplicate, err := instance.CanReplicateFrom(belowInstance) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("%t", canReplicate), Details: belowKey}) +} + +// CanReplicateFromGTID attempts to move an instance below another via GTID. 
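// Illustrative, standalone sketch, not part of this patch: the can-replicate-from handler above returns its
// boolean verdict as the envelope Message (formatted with "%t"), so a client can parse it with
// strconv.ParseBool. The base URL, route path and JSON field names are assumptions.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"strconv"
)

func main() {
	// Hypothetical route: instance first, candidate master second.
	url := "http://orchestrator.example.com:3000/api/can-replicate-from/db-1.example.com/3306/db-2.example.com/3306"
	resp, err := http.Get(url)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var e struct{ Code, Message string }
	if err := json.NewDecoder(resp.Body).Decode(&e); err != nil {
		panic(err)
	}
	canReplicate, err := strconv.ParseBool(e.Message)
	if err != nil {
		panic(err)
	}
	fmt.Println("can replicate:", canReplicate)
}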
+func (this *HttpAPI) CanReplicateFromGTID(params martini.Params, r render.Render, req *http.Request, user auth.User) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, found, err := inst.ReadInstance(&instanceKey) + if (!found) || (err != nil) { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", instanceKey)}) + return + } + belowKey, err := this.getInstanceKey(params["belowHost"], params["belowPort"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + belowInstance, found, err := inst.ReadInstance(&belowKey) + if (!found) || (err != nil) { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", belowKey)}) + return + } + + canReplicate, err := instance.CanReplicateFrom(belowInstance) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + if !canReplicate { + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("%t", canReplicate), Details: belowKey}) + return + } + err = inst.CheckMoveViaGTID(instance, belowInstance) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + canReplicate = (err == nil) + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("%t", canReplicate), Details: belowKey}) +} + +// setSemiSyncMaster +func (this *HttpAPI) setSemiSyncMaster(params martini.Params, r render.Render, req *http.Request, user auth.User, enable bool) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.SetSemiSyncMaster(&instanceKey, enable) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("master semi-sync set to %t", enable), Details: instance}) +} + +func (this *HttpAPI) EnableSemiSyncMaster(params martini.Params, r render.Render, req *http.Request, user auth.User) { + this.setSemiSyncMaster(params, r, req, user, true) +} +func (this *HttpAPI) DisableSemiSyncMaster(params martini.Params, r render.Render, req *http.Request, user auth.User) { + this.setSemiSyncMaster(params, r, req, user, false) +} + +// setSemiSyncMaster +func (this *HttpAPI) setSemiSyncReplica(params martini.Params, r render.Render, req *http.Request, user auth.User, enable bool) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.SetSemiSyncReplica(&instanceKey, enable) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("replica semi-sync set to %t", enable), Details: instance}) +} + +func (this *HttpAPI) EnableSemiSyncReplica(params martini.Params, r render.Render, req *http.Request, user auth.User) { + this.setSemiSyncReplica(params, r, req, user, true) +} + +func (this *HttpAPI) DisableSemiSyncReplica(params martini.Params, r render.Render, req *http.Request, user auth.User) { + 
this.setSemiSyncReplica(params, r, req, user, false) +} + +// SetReadOnly sets the global read_only variable +func (this *HttpAPI) SetReadOnly(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.SetReadOnly(&instanceKey, true) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: "Server set as read-only", Details: instance}) +} + +// SetWriteable clears the global read_only variable +func (this *HttpAPI) SetWriteable(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.SetReadOnly(&instanceKey, false) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: "Server set as writeable", Details: instance}) +} + +// KillQuery kills a query running on a server +func (this *HttpAPI) KillQuery(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + processId, err := strconv.ParseInt(params["process"], 10, 0) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, err := inst.KillQuery(&instanceKey, processId) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Query killed on: %+v", instance.Key), Details: instance}) +} + +// asciiTopology returns an ascii graph of cluster's instances +func (this *HttpAPI) asciiTopology(params martini.Params, r render.Render, req *http.Request, tabulated bool, printTags bool) { + clusterName, err := figureClusterName(getClusterHint(params)) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + asciiOutput, err := inst.ASCIITopology(clusterName, "", tabulated, printTags) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Topology for cluster %s", clusterName), Details: asciiOutput}) +} + +// SnapshotTopologies triggers orchestrator to record a snapshot of host/master for all known hosts.
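// Illustrative, standalone sketch, not part of this patch: fetching the ascii topology of a cluster and
// printing the Details field, which the asciiTopology handler above fills with the rendered graph. The base
// URL, the "/api/topology/:clusterHint" route and the JSON field names are assumptions.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	resp, err := http.Get("http://orchestrator.example.com:3000/api/topology/mycluster")
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var e struct {
		Code    string
		Message string
		Details string // asciiOutput is a plain string in this handler
	}
	if err := json.NewDecoder(resp.Body).Decode(&e); err != nil {
		panic(err)
	}
	fmt.Println(e.Details)
}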
+func (this *HttpAPI) SnapshotTopologies(params martini.Params, r render.Render, req *http.Request) { + start := time.Now() + if err := inst.SnapshotTopologies(); err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err), Details: fmt.Sprintf("Took %v", time.Since(start))}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: "Topology Snapshot completed", Details: fmt.Sprintf("Took %v", time.Since(start))}) +} + +// AsciiTopology returns an ascii graph of cluster's instances +func (this *HttpAPI) AsciiTopology(params martini.Params, r render.Render, req *http.Request) { + this.asciiTopology(params, r, req, false, false) +} + +// AsciiTopology returns an ascii graph of cluster's instances +func (this *HttpAPI) AsciiTopologyTabulated(params martini.Params, r render.Render, req *http.Request) { + this.asciiTopology(params, r, req, true, false) +} + +// AsciiTopologyTags returns an ascii graph of cluster's instances and instance tags +func (this *HttpAPI) AsciiTopologyTags(params martini.Params, r render.Render, req *http.Request) { + this.asciiTopology(params, r, req, false, true) +} + +// Cluster provides list of instances in given cluster +func (this *HttpAPI) Cluster(params martini.Params, r render.Render, req *http.Request) { + clusterName, err := figureClusterName(getClusterHint(params)) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + instances, err := inst.ReadClusterInstances(clusterName) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, instances) +} + +// ClusterByAlias provides list of instances in given cluster +func (this *HttpAPI) ClusterByAlias(params martini.Params, r render.Render, req *http.Request) { + clusterName, err := inst.GetClusterByAlias(params["clusterAlias"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + params["clusterName"] = clusterName + this.Cluster(params, r, req) +} + +// ClusterByInstance provides list of instances in cluster an instance belongs to +func (this *HttpAPI) ClusterByInstance(params martini.Params, r render.Render, req *http.Request) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, found, err := inst.ReadInstance(&instanceKey) + if (!found) || (err != nil) { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", instanceKey)}) + return + } + + params["clusterName"] = instance.ClusterName + this.Cluster(params, r, req) +} + +// ClusterInfo provides details of a given cluster +func (this *HttpAPI) ClusterInfo(params martini.Params, r render.Render, req *http.Request) { + clusterName, err := figureClusterName(getClusterHint(params)) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + clusterInfo, err := inst.ReadClusterInfo(clusterName) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, clusterInfo) +} + +// Cluster provides list of instances in given cluster +func (this *HttpAPI) ClusterInfoByAlias(params martini.Params, r render.Render, req *http.Request) { + clusterName, err := inst.GetClusterByAlias(params["clusterAlias"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", 
err)}) + return + } + + params["clusterName"] = clusterName + this.ClusterInfo(params, r, req) +} + +// ClusterOSCReplicas returns heuristic list of OSC replicas +func (this *HttpAPI) ClusterOSCReplicas(params martini.Params, r render.Render, req *http.Request) { + clusterName, err := figureClusterName(getClusterHint(params)) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + instances, err := inst.GetClusterOSCReplicas(clusterName) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, instances) +} + +// SetClusterAlias will change an alias for a given clustername +func (this *HttpAPI) SetClusterAliasManualOverride(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + clusterName := params["clusterName"] + alias := req.URL.Query().Get("alias") + + var err error + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("set-cluster-alias-manual-override", []string{clusterName, alias}) + } else { + err = inst.SetClusterAliasManualOverride(clusterName, alias) + } + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Cluster %s now has alias '%s'", clusterName, alias)}) +} + +// Clusters provides list of known clusters +func (this *HttpAPI) Clusters(params martini.Params, r render.Render, req *http.Request) { + clusterNames, err := inst.ReadClusters() + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, clusterNames) +} + +// ClustersInfo provides list of known clusters, along with some added metadata per cluster +func (this *HttpAPI) ClustersInfo(params martini.Params, r render.Render, req *http.Request) { + clustersInfo, err := inst.ReadClustersInfo("") + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, clustersInfo) +} + +// Tags lists existing tags for a given instance +func (this *HttpAPI) Tags(params martini.Params, r render.Render, req *http.Request) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + tags, err := inst.ReadInstanceTags(&instanceKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + tagStrings := []string{} + for _, tag := range tags { + tagStrings = append(tagStrings, tag.String()) + } + r.JSON(http.StatusOK, tagStrings) +} + +// TagValue returns a given tag's value for a specific instance +func (this *HttpAPI) TagValue(params martini.Params, r render.Render, req *http.Request) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + tag, err := getTag(params, req) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + tagExists, err := inst.ReadInstanceTag(&instanceKey, tag) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + if tagExists { + r.JSON(http.StatusOK, tag.TagValue) + } else { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("tag %s not 
found for %+v", tag.TagName, instanceKey)}) + } +} + +// Tagged return instance keys tagged by "tag" query param +func (this *HttpAPI) Tagged(params martini.Params, r render.Render, req *http.Request) { + tagsString := req.URL.Query().Get("tag") + instanceKeyMap, err := inst.GetInstanceKeysByTags(tagsString) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + r.JSON(http.StatusOK, instanceKeyMap.GetInstanceKeys()) +} + +// Tags adds a tag to a given instance +func (this *HttpAPI) Tag(params martini.Params, r render.Render, req *http.Request) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + tag, err := getTag(params, req) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("put-instance-tag", inst.InstanceTag{Key: instanceKey, T: *tag}) + } else { + err = inst.PutInstanceTag(&instanceKey, tag) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("%+v tagged with %s", instanceKey, tag.String()), Details: instanceKey}) +} + +// Untag removes a tag from an instance +func (this *HttpAPI) Untag(params martini.Params, r render.Render, req *http.Request) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + tag, err := getTag(params, req) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + untagged, err := inst.Untag(&instanceKey, tag) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("%s removed from %+v instances", tag.TagName, len(*untagged)), Details: untagged.GetInstanceKeys()}) +} + +// UntagAll removes a tag from all matching instances +func (this *HttpAPI) UntagAll(params martini.Params, r render.Render, req *http.Request) { + tag, err := getTag(params, req) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + untagged, err := inst.Untag(nil, tag) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("%s removed from %+v instances", tag.TagName, len(*untagged)), Details: untagged.GetInstanceKeys()}) +} + +// Write a cluster's master (or all clusters masters) to kv stores. +// This should generally only happen once in a lifetime of a cluster. Otherwise KV +// stores are updated via failovers. 
+func (this *HttpAPI) SubmitMastersToKvStores(params martini.Params, r render.Render, req *http.Request) { + clusterName, err := getClusterNameIfExists(params) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + kvPairs, submittedCount, err := logic.SubmitMastersToKvStores(clusterName, true) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Submitted %d masters", submittedCount), Details: kvPairs}) +} + +// Clusters provides list of known masters +func (this *HttpAPI) Masters(params martini.Params, r render.Render, req *http.Request) { + instances, err := inst.ReadWriteableClustersMasters() + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, instances) +} + +// ClusterMaster returns the writable master of a given cluster +func (this *HttpAPI) ClusterMaster(params martini.Params, r render.Render, req *http.Request) { + clusterName, err := figureClusterName(getClusterHint(params)) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + masters, err := inst.ReadClusterMaster(clusterName) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + if len(masters) == 0 { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("No masters found for %+v", clusterName)}) + return + } + + r.JSON(http.StatusOK, masters[0]) +} + +// Downtimed lists downtimed instances, potentially filtered by cluster +func (this *HttpAPI) Downtimed(params martini.Params, r render.Render, req *http.Request) { + clusterName, err := getClusterNameIfExists(params) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + instances, err := inst.ReadDowntimedInstances(clusterName) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, instances) +} + +// AllInstances lists all known instances +func (this *HttpAPI) AllInstances(params martini.Params, r render.Render, req *http.Request) { + instances, err := inst.SearchInstances("") + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, instances) +} + +// Search provides list of instances matching given search param via various criteria. 
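// Illustrative, standalone sketch, not part of this patch: the Search handler defined next accepts the
// search string either as a path parameter or via the "s" query parameter, and replies with a raw JSON
// array of instances. The base URL and route are assumptions; decoding into a generic slice avoids
// assuming the Instance field layout.
package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"net/url"
)

func main() {
	u := "http://orchestrator.example.com:3000/api/search?s=" + url.QueryEscape("db-12")
	resp, err := http.Get(u)
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var instances []map[string]interface{}
	if err := json.NewDecoder(resp.Body).Decode(&instances); err != nil {
		panic(err)
	}
	fmt.Printf("found %d instances\n", len(instances))
}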
+func (this *HttpAPI) Search(params martini.Params, r render.Render, req *http.Request) { + searchString := params["searchString"] + if searchString == "" { + searchString = req.URL.Query().Get("s") + } + instances, err := inst.SearchInstances(searchString) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, instances) +} + +// Problems provides list of instances with known problems +func (this *HttpAPI) Problems(params martini.Params, r render.Render, req *http.Request) { + clusterName := params["clusterName"] + instances, err := inst.ReadProblemInstances(clusterName) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, instances) +} + +// Audit provides list of audit entries by given page number +func (this *HttpAPI) Audit(params martini.Params, r render.Render, req *http.Request) { + page, err := strconv.Atoi(params["page"]) + if err != nil || page < 0 { + page = 0 + } + var auditedInstanceKey *inst.InstanceKey + if instanceKey, err := this.getInstanceKey(params["host"], params["port"]); err == nil { + auditedInstanceKey = &instanceKey + } + + audits, err := inst.ReadRecentAudit(auditedInstanceKey, page) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, audits) +} + +// HostnameResolveCache shows content of in-memory hostname cache +func (this *HttpAPI) HostnameResolveCache(params martini.Params, r render.Render, req *http.Request) { + content, err := inst.HostnameResolveCache() + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: "Cache retrieved", Details: content}) +} + +// ResetHostnameResolveCache clears in-memory hostname resovle cache +func (this *HttpAPI) ResetHostnameResolveCache(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + err := inst.ResetHostnameResolveCache() + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: "Hostname cache cleared"}) +} + +// DeregisterHostnameUnresolve deregisters the unresolve name used previously +func (this *HttpAPI) DeregisterHostnameUnresolve(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + + var instanceKey *inst.InstanceKey + if instKey, err := this.getInstanceKey(params["host"], params["port"]); err == nil { + instanceKey = &instKey + } + + var err error + registration := inst.NewHostnameDeregistration(instanceKey) + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("register-hostname-unresolve", registration) + } else { + err = inst.RegisterHostnameUnresolve(registration) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + Respond(r, &APIResponse{Code: OK, Message: "Hostname deregister unresolve completed", Details: instanceKey}) +} + +// RegisterHostnameUnresolve registers the unresolve name to use +func (this *HttpAPI) RegisterHostnameUnresolve(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if 
!isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + + var instanceKey *inst.InstanceKey + if instKey, err := this.getInstanceKey(params["host"], params["port"]); err == nil { + instanceKey = &instKey + } + + hostname := params["virtualname"] + var err error + registration := inst.NewHostnameRegistration(instanceKey, hostname) + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("register-hostname-unresolve", registration) + } else { + err = inst.RegisterHostnameUnresolve(registration) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + Respond(r, &APIResponse{Code: OK, Message: "Hostname register unresolve completed", Details: instanceKey}) +} + +// SubmitPoolInstances (re-)applies the list of hostnames for a given pool +func (this *HttpAPI) SubmitPoolInstances(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + pool := params["pool"] + instances := req.URL.Query().Get("instances") + + var err error + submission := inst.NewPoolInstancesSubmission(pool, instances) + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("submit-pool-instances", submission) + } else { + err = inst.ApplyPoolInstances(submission) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Applied %s pool instances", pool), Details: pool}) +} + +// SubmitPoolHostnames (re-)applies the list of hostnames for a given pool +func (this *HttpAPI) ReadClusterPoolInstancesMap(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + clusterName := params["clusterName"] + pool := params["pool"] + + poolInstancesMap, err := inst.ReadClusterPoolInstancesMap(clusterName, pool) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Read pool instances for cluster %s", clusterName), Details: poolInstancesMap}) +} + +// GetHeuristicClusterPoolInstances returns instances belonging to a cluster's pool +func (this *HttpAPI) GetHeuristicClusterPoolInstances(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + clusterName, err := figureClusterName(getClusterHint(params)) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + pool := params["pool"] + + instances, err := inst.GetHeuristicClusterPoolInstances(clusterName, pool) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Heuristic pool instances for cluster %s", clusterName), Details: instances}) +} + +// GetHeuristicClusterPoolInstances returns instances belonging to a cluster's pool +func (this *HttpAPI) GetHeuristicClusterPoolInstancesLag(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + 
clusterName, err := inst.ReadClusterNameByAlias(params["clusterName"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + pool := params["pool"] + + lag, err := inst.GetHeuristicClusterPoolInstancesLag(clusterName, pool) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Heuristic pool lag for cluster %s", clusterName), Details: lag}) +} + +// ReloadClusterAlias is a retired API call; it only reports that it has been retired +func (this *HttpAPI) ReloadClusterAlias(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + + Respond(r, &APIResponse{Code: ERROR, Message: "This API call has been retired"}) +} + +// BulkPromotionRules returns a list of the known promotion rules for each instance +func (this *HttpAPI) BulkPromotionRules(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + + promotionRules, err := inst.BulkReadCandidateDatabaseInstance() + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, promotionRules) +} + +// BulkInstances returns a list of all known instances +func (this *HttpAPI) BulkInstances(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + + instances, err := inst.BulkReadInstance() + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, instances) +} + +// DiscoveryMetricsRaw will return the last X seconds worth of discovery information in time-based order as a JSON array +func (this *HttpAPI) DiscoveryMetricsRaw(params martini.Params, r render.Render, req *http.Request, user auth.User) { + seconds, err := strconv.Atoi(params["seconds"]) + if err != nil || seconds <= 0 { + Respond(r, &APIResponse{Code: ERROR, Message: "Invalid value provided for seconds"}) + return + } + + refTime := time.Now().Add(-time.Duration(seconds) * time.Second) + json, err := discovery.JSONSince(discoveryMetrics, refTime) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Unable to determine start time. Perhaps seconds value is wrong?"}) + return + } + log.Debugf("DiscoveryMetricsRaw data: retrieved %d entries from discovery.MC", len(json)) + + r.JSON(http.StatusOK, json) +} + +// DiscoveryMetricsAggregated will return a single set of aggregated metrics for raw values collected since the +// specified time.
+func (this *HttpAPI) DiscoveryMetricsAggregated(params martini.Params, r render.Render, req *http.Request, user auth.User) { + seconds, err := strconv.Atoi(params["seconds"]) + if err != nil || seconds <= 0 { + Respond(r, &APIResponse{Code: ERROR, Message: "Invalid value provided for seconds"}) + return + } + + refTime := time.Now().Add(-time.Duration(seconds) * time.Second) + aggregated, err := discovery.AggregatedSince(discoveryMetrics, refTime) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Unable to generate aggregated discovery metrics"}) + return + } + // log.Debugf("DiscoveryMetricsAggregated data: %+v", aggregated) + r.JSON(http.StatusOK, aggregated) +} + +// DiscoveryQueueMetricsRaw returns the raw queue metrics (active and +// queued values), sampled every second for the last N seconds. +func (this *HttpAPI) DiscoveryQueueMetricsRaw(params martini.Params, r render.Render, req *http.Request, user auth.User) { + seconds, err := strconv.Atoi(params["seconds"]) + log.Debugf("DiscoveryQueueMetricsRaw: seconds: %d", seconds) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Unable to generate discovery queue aggregated metrics"}) + return + } + + queue := discovery.CreateOrReturnQueue("DEFAULT") + metrics := queue.DiscoveryQueueMetrics(seconds) + log.Debugf("DiscoveryQueueMetricsRaw data: %+v", metrics) + + r.JSON(http.StatusOK, metrics) +} + +// DiscoveryQueueMetricsAggregated returns a single value showing the metrics of the discovery queue over the last N seconds. +// This is expected to be called every 60 seconds (?) and the config setting of the retention period is currently hard-coded. +// See go/discovery/ for more information. +func (this *HttpAPI) DiscoveryQueueMetricsAggregated(params martini.Params, r render.Render, req *http.Request, user auth.User) { + seconds, err := strconv.Atoi(params["seconds"]) + log.Debugf("DiscoveryQueueMetricsAggregated: seconds: %d", seconds) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Unable to generate discovery queue aggregated metrics"}) + return + } + + queue := discovery.CreateOrReturnQueue("DEFAULT") + aggregated := queue.AggregatedDiscoveryQueueMetrics(seconds) + log.Debugf("DiscoveryQueueMetricsAggregated data: %+v", aggregated) + + r.JSON(http.StatusOK, aggregated) +} + +// BackendQueryMetricsRaw returns the raw backend query metrics +func (this *HttpAPI) BackendQueryMetricsRaw(params martini.Params, r render.Render, req *http.Request, user auth.User) { + seconds, err := strconv.Atoi(params["seconds"]) + log.Debugf("BackendQueryMetricsRaw: seconds: %d", seconds) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Unable to generate raw backend query metrics"}) + return + } + + refTime := time.Now().Add(-time.Duration(seconds) * time.Second) + m, err := queryMetrics.Since(refTime) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Unable to return backend query metrics"}) + return + } + + log.Debugf("BackendQueryMetricsRaw data: %+v", m) + + r.JSON(http.StatusOK, m) +} + +// BackendQueryMetricsAggregated returns aggregated backend query metrics for the last N seconds +func (this *HttpAPI) BackendQueryMetricsAggregated(params martini.Params, r render.Render, req *http.Request, user auth.User) { + seconds, err := strconv.Atoi(params["seconds"]) + log.Debugf("BackendQueryMetricsAggregated: seconds: %d", seconds) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Unable to generate aggregated backend query metrics"}) + return + } + + refTime := time.Now().Add(-time.Duration(seconds) * time.Second) + aggregated := query.AggregatedSince(queryMetrics, refTime) + log.Debugf("BackendQueryMetricsAggregated data: %+v", aggregated) + + r.JSON(http.StatusOK, aggregated) +}
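+
+// Editorial note: an illustrative sketch, not part of this patch. The seconds-based metrics
+// handlers above all follow the same shape: parse the ":seconds" route parameter, derive a
+// reference time that far in the past, then return whatever was collected since. A hypothetical
+// helper (names invented here for illustration only) could factor that out:
+//
+//	func metricsSince(params martini.Params, collect func(time.Time) (interface{}, error)) (interface{}, error) {
+//		seconds, err := strconv.Atoi(params["seconds"])
+//		if err != nil || seconds <= 0 {
+//			return nil, fmt.Errorf("invalid seconds value: %q", params["seconds"])
+//		}
+//		refTime := time.Now().Add(-time.Duration(seconds) * time.Second)
+//		return collect(refTime)
+//	}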
+ +// WriteBufferMetricsRaw returns the raw instance write buffer metrics +func (this *HttpAPI) WriteBufferMetricsRaw(params martini.Params, r render.Render, req *http.Request, user auth.User) { + seconds, err := strconv.Atoi(params["seconds"]) + log.Debugf("WriteBufferMetricsRaw: seconds: %d", seconds) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Unable to generate raw instance write buffer metrics"}) + return + } + + refTime := time.Now().Add(-time.Duration(seconds) * time.Second) + m, err := writeBufferMetrics.Since(refTime) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Unable to return instance write buffermetrics"}) + return + } + + log.Debugf("WriteBufferMetricsRaw data: %+v", m) + + r.JSON(http.StatusOK, m) +} + +// WriteBufferMetricsAggregated provides aggregate metrics of instance write buffer metrics +func (this *HttpAPI) WriteBufferMetricsAggregated(params martini.Params, r render.Render, req *http.Request, user auth.User) { + seconds, err := strconv.Atoi(params["seconds"]) + log.Debugf("WriteBufferMetricsAggregated: seconds: %d", seconds) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Unable to aggregated instance write buffer metrics"}) + return + } + + refTime := time.Now().Add(-time.Duration(seconds) * time.Second) + aggregated := inst.AggregatedSince(writeBufferMetrics, refTime) + log.Debugf("WriteBufferMetricsAggregated data: %+v", aggregated) + + r.JSON(http.StatusOK, aggregated) +} + +// Agents provides complete list of registered agents (See https://github.com/openark/orchestrator-agent) +func (this *HttpAPI) Agents(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + agents, err := agent.ReadAgents() + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, agents) +} + +// Agent returns complete information of a given agent +func (this *HttpAPI) Agent(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + agent, err := agent.GetAgent(params["host"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, agent) +} + +// AgentUnmount instructs an agent to unmount the designated mount point +func (this *HttpAPI) AgentUnmount(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.Unmount(params["host"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AgentMountLV instructs an agent to mount a given volume on the designated mount point +func (this *HttpAPI) AgentMountLV(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if 
!isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.MountLV(params["host"], req.URL.Query().Get("lv")) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AgentCreateSnapshot instructs an agent to create a new snapshot. Agent's DIY implementation. +func (this *HttpAPI) AgentCreateSnapshot(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.CreateSnapshot(params["host"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AgentRemoveLV instructs an agent to remove a logical volume +func (this *HttpAPI) AgentRemoveLV(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.RemoveLV(params["host"], req.URL.Query().Get("lv")) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AgentMySQLStop stops MySQL service on agent +func (this *HttpAPI) AgentMySQLStop(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.MySQLStop(params["host"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AgentMySQLStart starts MySQL service on agent +func (this *HttpAPI) AgentMySQLStart(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.MySQLStart(params["host"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +func (this *HttpAPI) AgentCustomCommand(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.CustomCommand(params["host"], params["command"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AgentSeed 
completely seeds a host with another host's snapshots. This is a complex operation +// governed by orchestrator and executed by the two agents involved. +func (this *HttpAPI) AgentSeed(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.Seed(params["targetHost"], params["sourceHost"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AgentActiveSeeds lists active seeds and their state +func (this *HttpAPI) AgentActiveSeeds(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.ReadActiveSeedsForHost(params["host"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AgentRecentSeeds lists recent seeds of a given agent +func (this *HttpAPI) AgentRecentSeeds(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.ReadRecentCompletedSeedsForHost(params["host"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AgentSeedDetails provides details of a given seed +func (this *HttpAPI) AgentSeedDetails(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + seedId, err := strconv.ParseInt(params["seedId"], 10, 0) + output, err := agent.AgentSeedDetails(seedId) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AgentSeedStates returns the breakdown of states (steps) of a given seed +func (this *HttpAPI) AgentSeedStates(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + seedId, err := strconv.ParseInt(params["seedId"], 10, 0) + output, err := agent.ReadSeedStates(seedId) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// Seeds returns all recent seeds +func (this *HttpAPI) Seeds(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + output, err := agent.ReadRecentSeeds() + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, output) +} + +// AbortSeed instructs agents to abort an active seed +func (this *HttpAPI) AbortSeed(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !config.Config.ServeAgentsHttp { + Respond(r, &APIResponse{Code: ERROR, Message: "Agents not served"}) + return + } + + seedId, err := strconv.ParseInt(params["seedId"], 10, 0) + err = agent.AbortSeed(seedId) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, err == nil) +} + +// Headers is a self-test call which returns HTTP headers +func (this *HttpAPI) Headers(params martini.Params, r render.Render, req *http.Request) { + r.JSON(http.StatusOK, req.Header) +} + +// Health performs a self test +func (this *HttpAPI) Health(params martini.Params, r render.Render, req *http.Request) { + health, err := process.HealthTest() + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Application node is unhealthy %+v", err), Details: health}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Application node is healthy"), Details: health}) + +} + +// LBCheck returns a constant response, and this can be used by load balancers that expect a given string. +func (this *HttpAPI) LBCheck(params martini.Params, r render.Render, req *http.Request) { + r.JSON(http.StatusOK, "OK") +} + +// LeaderCheck responds with HTTP 200 when this node is the leader, or with a configurable error status otherwise; load balancers can use it to direct traffic to the leader. +func (this *HttpAPI) LeaderCheck(params martini.Params, r render.Render, req *http.Request) { + respondStatus, err := strconv.Atoi(params["errorStatusCode"]) + if err != nil || respondStatus < 0 { + respondStatus = http.StatusNotFound + } + + if logic.IsLeader() { + r.JSON(http.StatusOK, "OK") + } else { + r.JSON(respondStatus, "Not leader") + } +} + +// StatusCheck is a configurable endpoint for regular status checks. While similar to +// Health(), this returns 500 on failure, which prevents issues for callers that have come to +// expect a 200. +// It might be a good idea to deprecate the current Health() behavior and roll this in at some +// point. +func (this *HttpAPI) StatusCheck(params martini.Params, r render.Render, req *http.Request) { + health, err := process.HealthTest() + if err != nil { + r.JSON(500, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Application node is unhealthy %+v", err), Details: health}) + return + } + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Application node is healthy"), Details: health}) +} + +// GrabElection forcibly grabs leadership. Use with care!!
+func (this *HttpAPI) GrabElection(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + err := process.GrabElection() + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Unable to grab election: %+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Node elected as leader")}) +} + +// Reelect causes re-elections for an active node +func (this *HttpAPI) Reelect(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + err := process.Reelect() + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Unable to re-elect: %+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Set re-elections")}) +} + +// RaftAddPeer adds a new node to the raft cluster +func (this *HttpAPI) RaftAddPeer(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-add-peer: not running with raft setup"}) + return + } + addr, err := orcraft.AddPeer(params["addr"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot add raft peer: %+v", err)}) + return + } + + r.JSON(http.StatusOK, addr) +} + +// RaftRemovePeer removes a node from the raft cluster +func (this *HttpAPI) RaftRemovePeer(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-remove-peer: not running with raft setup"}) + return + } + addr, err := orcraft.RemovePeer(params["addr"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot remove raft peer: %+v", err)}) + return + } + + r.JSON(http.StatusOK, addr) +} + +// RaftYield yields to a specified host +func (this *HttpAPI) RaftYield(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-yield: not running with raft setup"}) + return + } + orcraft.PublishYield(params["node"]) + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Asynchronously yielded")}) +} + +// RaftYieldHint yields to a host whose name contains given hint (e.g.
DC) +func (this *HttpAPI) RaftYieldHint(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-yield-hint: not running with raft setup"}) + return + } + hint := params["hint"] + orcraft.PublishYieldHostnameHint(hint) + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Asynchronously yielded by hint %s", hint), Details: hint}) +} + +// RaftPeers returns the list of peers in a raft setup +func (this *HttpAPI) RaftPeers(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-nodes: not running with raft setup"}) + return + } + + peers, err := orcraft.GetPeers() + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot get raft peers: %+v", err)}) + return + } + + r.JSON(http.StatusOK, peers) +} + +// RaftState returns the state of this raft node +func (this *HttpAPI) RaftState(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-state: not running with raft setup"}) + return + } + + state := orcraft.GetState().String() + r.JSON(http.StatusOK, state) +} + +// RaftLeader returns the identify of the leader, if possible +func (this *HttpAPI) RaftLeader(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-leader: not running with raft setup"}) + return + } + + leader := orcraft.GetLeader() + r.JSON(http.StatusOK, leader) +} + +// RaftHealth indicates whether this node is part of a healthy raft group +func (this *HttpAPI) RaftHealth(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-state: not running with raft setup"}) + return + } + if !orcraft.IsHealthy() { + Respond(r, &APIResponse{Code: ERROR, Message: "unhealthy"}) + return + } + r.JSON(http.StatusOK, "healthy") +} + +// RaftStatus exports a status summary for a raft node +func (this *HttpAPI) RaftStatus(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-state: not running with raft setup"}) + return + } + peers, err := orcraft.GetPeers() + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot get raft peers: %+v", err)}) + return + } + + status := struct { + RaftBind string + RaftAdvertise string + State string + Healthy bool + IsPartOfQuorum bool + Leader string + LeaderURI string + Peers []string + }{ + RaftBind: orcraft.GetRaftBind(), + RaftAdvertise: orcraft.GetRaftAdvertise(), + State: orcraft.GetState().String(), + Healthy: orcraft.IsHealthy(), + IsPartOfQuorum: orcraft.IsPartOfQuorum(), + Leader: orcraft.GetLeader(), + LeaderURI: orcraft.LeaderURI.Get(), + Peers: peers, + } + r.JSON(http.StatusOK, status) +} + +// RaftFollowerHealthReport is initiated by followers to report their identity and health to the raft leader. 
+func (this *HttpAPI) RaftFollowerHealthReport(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-state: not running with raft setup"}) + return + } + err := orcraft.OnHealthReport(params["authenticationToken"], params["raftBind"], params["raftAdvertise"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot process health report: %+v", err)}) + return + } + r.JSON(http.StatusOK, "health reported") +} + +// RaftSnapshot instructs raft to take a snapshot +func (this *HttpAPI) RaftSnapshot(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !orcraft.IsRaftEnabled() { + Respond(r, &APIResponse{Code: ERROR, Message: "raft-leader: not running with raft setup"}) + return + } + err := orcraft.Snapshot() + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot create snapshot: %+v", err)}) + return + } + r.JSON(http.StatusOK, "snapshot created") +} + +// ReloadConfiguration reloads config settings (not all of which take effect after reload) +func (this *HttpAPI) ReloadConfiguration(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + extraConfigFile := req.URL.Query().Get("config") + config.Reload(extraConfigFile) + inst.AuditOperation("reload-configuration", nil, "Triggered via API") + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Config reloaded"), Details: extraConfigFile}) +} + +// replicationAnalysis returns the list of replication analysis issues, optionally filtered to a single instance +func (this *HttpAPI) replicationAnalysis(clusterName string, instanceKey *inst.InstanceKey, params martini.Params, r render.Render, req *http.Request) { + analysis, err := inst.GetReplicationAnalysis(clusterName, &inst.ReplicationAnalysisHints{IncludeDowntimed: true}) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot get analysis: %+v", err)}) + return + } + // Possibly filter single instance + if instanceKey != nil { + filtered := analysis[:0] + for _, analysisEntry := range analysis { + if instanceKey.Equals(&analysisEntry.AnalyzedInstanceKey) { + filtered = append(filtered, analysisEntry) + } + } + analysis = filtered + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Analysis"), Details: analysis}) +} + +// ReplicationAnalysis returns the list of issues across all clusters +func (this *HttpAPI) ReplicationAnalysis(params martini.Params, r render.Render, req *http.Request) { + this.replicationAnalysis("", nil, params, r, req) +} + +// ReplicationAnalysisForCluster returns the list of issues for a given cluster +func (this *HttpAPI) ReplicationAnalysisForCluster(params martini.Params, r render.Render, req *http.Request) { + clusterName := params["clusterName"] + + var err error + if clusterName, err = inst.DeduceClusterName(params["clusterName"]); err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot get analysis: %+v", err)}) + return + } + if clusterName == "" { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot get cluster name: %+v", params["clusterName"])}) + return + } + this.replicationAnalysis(clusterName, nil, params, r, req) +} + +// ReplicationAnalysisForKey returns the list of issues for a given instance +func (this *HttpAPI) ReplicationAnalysisForKey(params martini.Params, r render.Render, req *http.Request) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { +
Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot get analysis: %+v", err)}) + return + } + if !instanceKey.IsValid() { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot get analysis: invalid key %+v", instanceKey)}) + return + } + this.replicationAnalysis("", &instanceKey, params, r, req) +} + +// RecoverLite attempts recovery on a given instance, without executing external processes +func (this *HttpAPI) RecoverLite(params martini.Params, r render.Render, req *http.Request, user auth.User) { + params["skipProcesses"] = "true" + this.Recover(params, r, req, user) +} + +// Recover attempts recovery on a given instance +func (this *HttpAPI) Recover(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + var candidateKey *inst.InstanceKey + if key, err := this.getInstanceKey(params["candidateHost"], params["candidatePort"]); err == nil { + candidateKey = &key + } + + skipProcesses := (req.URL.Query().Get("skipProcesses") == "true") || (params["skipProcesses"] == "true") + recoveryAttempted, promotedInstanceKey, err := logic.CheckAndRecover(&instanceKey, candidateKey, skipProcesses) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error(), Details: instanceKey}) + return + } + if !recoveryAttempted { + Respond(r, &APIResponse{Code: ERROR, Message: "Recovery not attempted", Details: instanceKey}) + return + } + if promotedInstanceKey == nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Recovery attempted but no instance promoted", Details: instanceKey}) + return + } + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Recovery executed on %+v", instanceKey), Details: *promotedInstanceKey}) +} + +// GracefulMasterTakeover gracefully fails over a master onto its single replica. 
+func (this *HttpAPI) gracefulMasterTakeover(params martini.Params, r render.Render, req *http.Request, user auth.User, auto bool) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + clusterName, err := figureClusterName(getClusterHint(params)) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + designatedKey, _ := this.getInstanceKey(params["designatedHost"], params["designatedPort"]) + // designatedKey may be empty/invalid + topologyRecovery, _, err := logic.GracefulMasterTakeover(clusterName, &designatedKey, auto) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error(), Details: topologyRecovery}) + return + } + if topologyRecovery == nil || topologyRecovery.SuccessorKey == nil { + Respond(r, &APIResponse{Code: ERROR, Message: "graceful-master-takeover: no successor promoted", Details: topologyRecovery}) + return + } + Respond(r, &APIResponse{Code: OK, Message: "graceful-master-takeover: successor promoted", Details: topologyRecovery}) +} + +// GracefulMasterTakeover gracefully fails over a master, either: +// - onto its single replica, or +// - onto a replica indicated by the user +func (this *HttpAPI) GracefulMasterTakeover(params martini.Params, r render.Render, req *http.Request, user auth.User) { + this.gracefulMasterTakeover(params, r, req, user, false) +} + +// GracefulMasterTakeoverAuto gracefully fails over a master onto a replica of orchestrator's choosing +func (this *HttpAPI) GracefulMasterTakeoverAuto(params martini.Params, r render.Render, req *http.Request, user auth.User) { + this.gracefulMasterTakeover(params, r, req, user, true) +} + +// ForceMasterFailover fails over a master (even if there's no particular problem with the master) +func (this *HttpAPI) ForceMasterFailover(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + clusterName, err := figureClusterName(getClusterHint(params)) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + topologyRecovery, err := logic.ForceMasterFailover(clusterName) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + if topologyRecovery.SuccessorKey != nil { + Respond(r, &APIResponse{Code: OK, Message: "Master failed over", Details: topologyRecovery}) + } else { + Respond(r, &APIResponse{Code: ERROR, Message: "Master not failed over", Details: topologyRecovery}) + } +} + +// ForceMasterTakeover fails over a master (even if there's no particular problem with the master) +func (this *HttpAPI) ForceMasterTakeover(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + clusterName, err := figureClusterName(getClusterHint(params)) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + designatedKey, _ := this.getInstanceKey(params["designatedHost"], params["designatedPort"]) + designatedInstance, _, err := inst.ReadInstance(&designatedKey) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + if designatedInstance == nil { + Respond(r, &APIResponse{Code: ERROR, Message: "Instance not found"}) + return + } + + topologyRecovery, err := 
logic.ForceMasterTakeover(clusterName, designatedInstance) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + if topologyRecovery.SuccessorKey != nil { + Respond(r, &APIResponse{Code: OK, Message: "Master failed over", Details: topologyRecovery}) + } else { + Respond(r, &APIResponse{Code: ERROR, Message: "Master not failed over", Details: topologyRecovery}) + } +} + +// Registers promotion preference for given instance +func (this *HttpAPI) RegisterCandidate(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + promotionRule, err := inst.ParseCandidatePromotionRule(params["promotionRule"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + candidate := inst.NewCandidateDatabaseInstance(&instanceKey, promotionRule).WithCurrentTime() + + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("register-candidate", candidate) + } else { + err = inst.RegisterCandidateInstance(candidate) + } + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: "Registered candidate", Details: instanceKey}) +} + +// AutomatedRecoveryFilters retuens list of clusters which are configured with automated recovery +func (this *HttpAPI) AutomatedRecoveryFilters(params martini.Params, r render.Render, req *http.Request) { + automatedRecoveryMap := make(map[string]interface{}) + automatedRecoveryMap["RecoverMasterClusterFilters"] = config.Config.RecoverMasterClusterFilters + automatedRecoveryMap["RecoverIntermediateMasterClusterFilters"] = config.Config.RecoverIntermediateMasterClusterFilters + automatedRecoveryMap["RecoveryIgnoreHostnameFilters"] = config.Config.RecoveryIgnoreHostnameFilters + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Automated recovery configuration details"), Details: automatedRecoveryMap}) +} + +// AuditFailureDetection provides list of topology_failure_detection entries +func (this *HttpAPI) AuditFailureDetection(params martini.Params, r render.Render, req *http.Request) { + + var audits []logic.TopologyRecovery + var err error + + if detectionId, derr := strconv.ParseInt(params["id"], 10, 0); derr == nil && detectionId > 0 { + audits, err = logic.ReadFailureDetection(detectionId) + } else { + page, derr := strconv.Atoi(params["page"]) + if derr != nil || page < 0 { + page = 0 + } + audits, err = logic.ReadRecentFailureDetections(params["clusterAlias"], page) + } + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, audits) +} + +// AuditRecoverySteps returns audited steps of a given recovery +func (this *HttpAPI) AuditRecoverySteps(params martini.Params, r render.Render, req *http.Request) { + recoveryUID := params["uid"] + audits, err := logic.ReadTopologyRecoverySteps(recoveryUID) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, audits) +} + +// ReadReplicationAnalysisChangelog lists instances and their analysis changelog +func (this *HttpAPI) ReadReplicationAnalysisChangelog(params martini.Params, r render.Render, req 
*http.Request) { + changelogs, err := inst.ReadReplicationAnalysisChangelog() + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, changelogs) +} + +// AuditRecovery provides list of topology-recovery entries +func (this *HttpAPI) AuditRecovery(params martini.Params, r render.Render, req *http.Request) { + var audits []logic.TopologyRecovery + var err error + + if recoveryUID := params["uid"]; recoveryUID != "" { + audits, err = logic.ReadRecoveryByUID(recoveryUID) + } else if recoveryId, derr := strconv.ParseInt(params["id"], 10, 0); derr == nil && recoveryId > 0 { + audits, err = logic.ReadRecovery(recoveryId) + } else { + page, derr := strconv.Atoi(params["page"]) + if derr != nil || page < 0 { + page = 0 + } + unacknowledgedOnly := (req.URL.Query().Get("unacknowledged") == "true") + + audits, err = logic.ReadRecentRecoveries(params["clusterName"], params["clusterAlias"], unacknowledgedOnly, page) + } + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, audits) +} + +// ActiveClusterRecovery returns recoveries in-progress for a given cluster +func (this *HttpAPI) ActiveClusterRecovery(params martini.Params, r render.Render, req *http.Request) { + recoveries, err := logic.ReadActiveClusterRecovery(params["clusterName"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, recoveries) +} + +// RecentlyActiveClusterRecovery returns recently active recoveries for a given cluster +func (this *HttpAPI) RecentlyActiveClusterRecovery(params martini.Params, r render.Render, req *http.Request) { + recoveries, err := logic.ReadRecentlyActiveClusterRecovery(params["clusterName"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, recoveries) +} + +// RecentlyActiveInstanceRecovery returns recently active recoveries for a given instance +func (this *HttpAPI) RecentlyActiveInstanceRecovery(params martini.Params, r render.Render, req *http.Request) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + recoveries, err := logic.ReadRecentlyActiveInstanceRecovery(&instanceKey) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, recoveries) +} + +// AcknowledgeClusterRecoveries acknowledges recoveries for a given cluster +func (this *HttpAPI) AcknowledgeClusterRecoveries(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + + var clusterName string + var err error + if params["clusterAlias"] != "" { + clusterName, err = inst.GetClusterByAlias(params["clusterAlias"]) + } else { + clusterName, err = figureClusterName(getClusterHint(params)) + } + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + comment := strings.TrimSpace(req.URL.Query().Get("comment")) + if comment == "" { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("No acknowledge comment given")}) + return + } + userId := getUserId(req, user) + if userId == "" { + userId = inst.GetMaintenanceOwner() + } + if orcraft.IsRaftEnabled() { + ack :=
logic.NewRecoveryAcknowledgement(userId, comment) + ack.ClusterName = clusterName + _, err = orcraft.PublishCommand("ack-recovery", ack) + } else { + _, err = logic.AcknowledgeClusterRecoveries(clusterName, userId, comment) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Acknowledged cluster recoveries"), Details: clusterName}) +} + +// AcknowledgeInstanceRecoveries acknowledges recoveries for a given instance +func (this *HttpAPI) AcknowledgeInstanceRecoveries(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + + comment := strings.TrimSpace(req.URL.Query().Get("comment")) + if comment == "" { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("No acknowledge comment given")}) + return + } + userId := getUserId(req, user) + if userId == "" { + userId = inst.GetMaintenanceOwner() + } + if orcraft.IsRaftEnabled() { + ack := logic.NewRecoveryAcknowledgement(userId, comment) + ack.Key = instanceKey + _, err = orcraft.PublishCommand("ack-recovery", ack) + } else { + _, err = logic.AcknowledgeInstanceRecoveries(&instanceKey, userId, comment) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Acknowledged instance recoveries"), Details: instanceKey}) +} + +// AcknowledgeRecovery acknowledges a single recovery by id or uid +func (this *HttpAPI) AcknowledgeRecovery(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + var err error + var recoveryId int64 + var idParam string + + // Ack either via id or uid + recoveryUid := params["uid"] + if recoveryUid == "" { + idParam = params["recoveryId"] + recoveryId, err = strconv.ParseInt(idParam, 10, 0) + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + } else { + idParam = recoveryUid + } + comment := strings.TrimSpace(req.URL.Query().Get("comment")) + if comment == "" { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("No acknowledge comment given")}) + return + } + userId := getUserId(req, user) + if userId == "" { + userId = inst.GetMaintenanceOwner() + } + if orcraft.IsRaftEnabled() { + ack := logic.NewRecoveryAcknowledgement(userId, comment) + ack.Id = recoveryId + ack.UID = recoveryUid + _, err = orcraft.PublishCommand("ack-recovery", ack) + } else { + if recoveryUid != "" { + _, err = logic.AcknowledgeRecoveryByUID(recoveryUid, userId, comment) + } else { + _, err = logic.AcknowledgeRecovery(recoveryId, userId, comment) + } + } + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Acknowledged recovery"), Details: idParam}) +} + +// AcknowledgeAllRecoveries acknowledges all recoveries +func (this *HttpAPI) AcknowledgeAllRecoveries(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + + comment
:= strings.TrimSpace(req.URL.Query().Get("comment")) + if comment == "" { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("No acknowledge comment given")}) + return + } + userId := getUserId(req, user) + if userId == "" { + userId = inst.GetMaintenanceOwner() + } + var err error + if orcraft.IsRaftEnabled() { + ack := logic.NewRecoveryAcknowledgement(userId, comment) + ack.AllRecoveries = true + _, err = orcraft.PublishCommand("ack-recovery", ack) + } else { + _, err = logic.AcknowledgeAllRecoveries(userId, comment) + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Acknowledged all recoveries"), Details: comment}) +} + +// BlockedRecoveries reads list of currently blocked recoveries, optionally filtered by cluster name +func (this *HttpAPI) BlockedRecoveries(params martini.Params, r render.Render, req *http.Request) { + blockedRecoveries, err := logic.ReadBlockedRecoveries(params["clusterName"]) + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + r.JSON(http.StatusOK, blockedRecoveries) +} + +// DisableGlobalRecoveries globally disables recoveries +func (this *HttpAPI) DisableGlobalRecoveries(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + + var err error + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("disable-global-recoveries", 0) + } else { + err = logic.DisableRecovery() + } + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: "Globally disabled recoveries", Details: "disabled"}) +} + +// EnableGlobalRecoveries globally enables recoveries +func (this *HttpAPI) EnableGlobalRecoveries(params martini.Params, r render.Render, req *http.Request, user auth.User) { + if !isAuthorizedForAction(req, user) { + Respond(r, &APIResponse{Code: ERROR, Message: "Unauthorized"}) + return + } + + var err error + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("enable-global-recoveries", 0) + } else { + err = logic.EnableRecovery() + } + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + Respond(r, &APIResponse{Code: OK, Message: "Globally enabled recoveries", Details: "enabled"}) +} + +// CheckGlobalRecoveries checks whether +func (this *HttpAPI) CheckGlobalRecoveries(params martini.Params, r render.Render, req *http.Request) { + isDisabled, err := logic.IsRecoveryDisabled() + + if err != nil { + Respond(r, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + details := "enabled" + if isDisabled { + details = "disabled" + } + Respond(r, &APIResponse{Code: OK, Message: fmt.Sprintf("Global recoveries %+v", details), Details: details}) +} + +func (this *HttpAPI) getSynonymPath(path string) (synonymPath string) { + pathBase := strings.Split(path, "/")[0] + if synonym, ok := apiSynonyms[pathBase]; ok { + synonymPath = fmt.Sprintf("%s%s", synonym, path[len(pathBase):]) + } + return synonymPath +} + +func (this *HttpAPI) registerSingleAPIRequest(m *martini.ClassicMartini, path string, handler martini.Handler, allowProxy bool) { + registeredPaths = append(registeredPaths, path) + fullPath := fmt.Sprintf("%s/api/%s", this.URLPrefix, path) + + if allowProxy && 
config.Config.RaftEnabled { + m.Get(fullPath, raftReverseProxy, handler) + } else { + m.Get(fullPath, handler) + } +} + +func (this *HttpAPI) registerAPIRequestInternal(m *martini.ClassicMartini, path string, handler martini.Handler, allowProxy bool) { + this.registerSingleAPIRequest(m, path, handler, allowProxy) + + if synonym := this.getSynonymPath(path); synonym != "" { + this.registerSingleAPIRequest(m, synonym, handler, allowProxy) + } +} + +func (this *HttpAPI) registerAPIRequest(m *martini.ClassicMartini, path string, handler martini.Handler) { + this.registerAPIRequestInternal(m, path, handler, true) +} + +func (this *HttpAPI) registerAPIRequestNoProxy(m *martini.ClassicMartini, path string, handler martini.Handler) { + this.registerAPIRequestInternal(m, path, handler, false) +} + +// RegisterRequests makes for the de-facto list of known API calls +func (this *HttpAPI) RegisterRequests(m *martini.ClassicMartini) { + // Smart relocation: + this.registerAPIRequest(m, "relocate/:host/:port/:belowHost/:belowPort", this.RelocateBelow) + this.registerAPIRequest(m, "relocate-below/:host/:port/:belowHost/:belowPort", this.RelocateBelow) + this.registerAPIRequest(m, "relocate-slaves/:host/:port/:belowHost/:belowPort", this.RelocateReplicas) + this.registerAPIRequest(m, "regroup-slaves/:host/:port", this.RegroupReplicas) + + // Classic file:pos relocation: + this.registerAPIRequest(m, "move-up/:host/:port", this.MoveUp) + this.registerAPIRequest(m, "move-up-slaves/:host/:port", this.MoveUpReplicas) + this.registerAPIRequest(m, "move-below/:host/:port/:siblingHost/:siblingPort", this.MoveBelow) + this.registerAPIRequest(m, "move-equivalent/:host/:port/:belowHost/:belowPort", this.MoveEquivalent) + this.registerAPIRequest(m, "repoint/:host/:port/:belowHost/:belowPort", this.Repoint) + this.registerAPIRequest(m, "repoint-slaves/:host/:port", this.RepointReplicas) + this.registerAPIRequest(m, "make-co-master/:host/:port", this.MakeCoMaster) + this.registerAPIRequest(m, "enslave-siblings/:host/:port", this.TakeSiblings) + this.registerAPIRequest(m, "enslave-master/:host/:port", this.TakeMaster) + this.registerAPIRequest(m, "master-equivalent/:host/:port/:logFile/:logPos", this.MasterEquivalent) + + // Binlog server relocation: + this.registerAPIRequest(m, "regroup-slaves-bls/:host/:port", this.RegroupReplicasBinlogServers) + + // GTID relocation: + this.registerAPIRequest(m, "move-below-gtid/:host/:port/:belowHost/:belowPort", this.MoveBelowGTID) + this.registerAPIRequest(m, "move-slaves-gtid/:host/:port/:belowHost/:belowPort", this.MoveReplicasGTID) + this.registerAPIRequest(m, "regroup-slaves-gtid/:host/:port", this.RegroupReplicasGTID) + + // Pseudo-GTID relocation: + this.registerAPIRequest(m, "match/:host/:port/:belowHost/:belowPort", this.MatchBelow) + this.registerAPIRequest(m, "match-below/:host/:port/:belowHost/:belowPort", this.MatchBelow) + this.registerAPIRequest(m, "match-up/:host/:port", this.MatchUp) + this.registerAPIRequest(m, "match-slaves/:host/:port/:belowHost/:belowPort", this.MultiMatchReplicas) + this.registerAPIRequest(m, "match-up-slaves/:host/:port", this.MatchUpReplicas) + this.registerAPIRequest(m, "regroup-slaves-pgtid/:host/:port", this.RegroupReplicasPseudoGTID) + // Legacy, need to revisit: + this.registerAPIRequest(m, "make-master/:host/:port", this.MakeMaster) + this.registerAPIRequest(m, "make-local-master/:host/:port", this.MakeLocalMaster) + + // Replication, general: + this.registerAPIRequest(m, "enable-gtid/:host/:port", this.EnableGTID) + 
this.registerAPIRequest(m, "disable-gtid/:host/:port", this.DisableGTID) + this.registerAPIRequest(m, "locate-gtid-errant/:host/:port", this.LocateErrantGTID) + this.registerAPIRequest(m, "gtid-errant-reset-master/:host/:port", this.ErrantGTIDResetMaster) + this.registerAPIRequest(m, "gtid-errant-inject-empty/:host/:port", this.ErrantGTIDInjectEmpty) + this.registerAPIRequest(m, "skip-query/:host/:port", this.SkipQuery) + this.registerAPIRequest(m, "start-slave/:host/:port", this.StartReplication) + this.registerAPIRequest(m, "restart-slave/:host/:port", this.RestartReplication) + this.registerAPIRequest(m, "stop-slave/:host/:port", this.StopReplication) + this.registerAPIRequest(m, "stop-slave-nice/:host/:port", this.StopReplicationNicely) + this.registerAPIRequest(m, "reset-slave/:host/:port", this.ResetReplication) + this.registerAPIRequest(m, "detach-slave/:host/:port", this.DetachReplicaMasterHost) + this.registerAPIRequest(m, "reattach-slave/:host/:port", this.ReattachReplicaMasterHost) + this.registerAPIRequest(m, "detach-slave-master-host/:host/:port", this.DetachReplicaMasterHost) + this.registerAPIRequest(m, "reattach-slave-master-host/:host/:port", this.ReattachReplicaMasterHost) + this.registerAPIRequest(m, "flush-binary-logs/:host/:port", this.FlushBinaryLogs) + this.registerAPIRequest(m, "purge-binary-logs/:host/:port/:logFile", this.PurgeBinaryLogs) + this.registerAPIRequest(m, "restart-slave-statements/:host/:port", this.RestartReplicationStatements) + this.registerAPIRequest(m, "enable-semi-sync-master/:host/:port", this.EnableSemiSyncMaster) + this.registerAPIRequest(m, "disable-semi-sync-master/:host/:port", this.DisableSemiSyncMaster) + this.registerAPIRequest(m, "enable-semi-sync-replica/:host/:port", this.EnableSemiSyncReplica) + this.registerAPIRequest(m, "disable-semi-sync-replica/:host/:port", this.DisableSemiSyncReplica) + + // Replication information: + this.registerAPIRequest(m, "can-replicate-from/:host/:port/:belowHost/:belowPort", this.CanReplicateFrom) + this.registerAPIRequest(m, "can-replicate-from-gtid/:host/:port/:belowHost/:belowPort", this.CanReplicateFromGTID) + + // Instance: + this.registerAPIRequest(m, "set-read-only/:host/:port", this.SetReadOnly) + this.registerAPIRequest(m, "set-writeable/:host/:port", this.SetWriteable) + this.registerAPIRequest(m, "kill-query/:host/:port/:process", this.KillQuery) + + // Binary logs: + this.registerAPIRequest(m, "last-pseudo-gtid/:host/:port", this.LastPseudoGTID) + + // Pools: + this.registerAPIRequest(m, "submit-pool-instances/:pool", this.SubmitPoolInstances) + this.registerAPIRequest(m, "cluster-pool-instances/:clusterName", this.ReadClusterPoolInstancesMap) + this.registerAPIRequest(m, "cluster-pool-instances/:clusterName/:pool", this.ReadClusterPoolInstancesMap) + this.registerAPIRequest(m, "heuristic-cluster-pool-instances/:clusterName", this.GetHeuristicClusterPoolInstances) + this.registerAPIRequest(m, "heuristic-cluster-pool-instances/:clusterName/:pool", this.GetHeuristicClusterPoolInstances) + this.registerAPIRequest(m, "heuristic-cluster-pool-lag/:clusterName", this.GetHeuristicClusterPoolInstancesLag) + this.registerAPIRequest(m, "heuristic-cluster-pool-lag/:clusterName/:pool", this.GetHeuristicClusterPoolInstancesLag) + + // Information: + this.registerAPIRequest(m, "search/:searchString", this.Search) + this.registerAPIRequest(m, "search", this.Search) + + // Cluster + this.registerAPIRequest(m, "cluster/:clusterHint", this.Cluster) + this.registerAPIRequest(m, "cluster/alias/:clusterAlias", 
this.ClusterByAlias) + this.registerAPIRequest(m, "cluster/instance/:host/:port", this.ClusterByInstance) + this.registerAPIRequest(m, "cluster-info/:clusterHint", this.ClusterInfo) + this.registerAPIRequest(m, "cluster-info/alias/:clusterAlias", this.ClusterInfoByAlias) + this.registerAPIRequest(m, "cluster-osc-slaves/:clusterHint", this.ClusterOSCReplicas) + this.registerAPIRequest(m, "set-cluster-alias/:clusterName", this.SetClusterAliasManualOverride) + this.registerAPIRequest(m, "clusters", this.Clusters) + this.registerAPIRequest(m, "clusters-info", this.ClustersInfo) + + this.registerAPIRequest(m, "masters", this.Masters) + this.registerAPIRequest(m, "master/:clusterHint", this.ClusterMaster) + this.registerAPIRequest(m, "instance-replicas/:host/:port", this.InstanceReplicas) + this.registerAPIRequest(m, "all-instances", this.AllInstances) + this.registerAPIRequest(m, "downtimed", this.Downtimed) + this.registerAPIRequest(m, "downtimed/:clusterHint", this.Downtimed) + this.registerAPIRequest(m, "topology/:clusterHint", this.AsciiTopology) + this.registerAPIRequest(m, "topology/:host/:port", this.AsciiTopology) + this.registerAPIRequest(m, "topology-tabulated/:clusterHint", this.AsciiTopologyTabulated) + this.registerAPIRequest(m, "topology-tabulated/:host/:port", this.AsciiTopologyTabulated) + this.registerAPIRequest(m, "topology-tags/:clusterHint", this.AsciiTopologyTags) + this.registerAPIRequest(m, "topology-tags/:host/:port", this.AsciiTopologyTags) + this.registerAPIRequest(m, "snapshot-topologies", this.SnapshotTopologies) + + // Key-value: + this.registerAPIRequest(m, "submit-masters-to-kv-stores", this.SubmitMastersToKvStores) + this.registerAPIRequest(m, "submit-masters-to-kv-stores/:clusterHint", this.SubmitMastersToKvStores) + + // Tags: + this.registerAPIRequest(m, "tagged", this.Tagged) + this.registerAPIRequest(m, "tags/:host/:port", this.Tags) + this.registerAPIRequest(m, "tag-value/:host/:port", this.TagValue) + this.registerAPIRequest(m, "tag-value/:host/:port/:tagName", this.TagValue) + this.registerAPIRequest(m, "tag/:host/:port", this.Tag) + this.registerAPIRequest(m, "tag/:host/:port/:tagName/:tagValue", this.Tag) + this.registerAPIRequest(m, "untag/:host/:port", this.Untag) + this.registerAPIRequest(m, "untag/:host/:port/:tagName", this.Untag) + this.registerAPIRequest(m, "untag-all", this.UntagAll) + this.registerAPIRequest(m, "untag-all/:tagName/:tagValue", this.UntagAll) + + // Instance management: + this.registerAPIRequest(m, "instance/:host/:port", this.Instance) + this.registerAPIRequest(m, "discover/:host/:port", this.Discover) + this.registerAPIRequest(m, "async-discover/:host/:port", this.AsyncDiscover) + this.registerAPIRequest(m, "refresh/:host/:port", this.Refresh) + this.registerAPIRequest(m, "forget/:host/:port", this.Forget) + this.registerAPIRequest(m, "forget-cluster/:clusterHint", this.ForgetCluster) + this.registerAPIRequest(m, "begin-maintenance/:host/:port/:owner/:reason", this.BeginMaintenance) + this.registerAPIRequest(m, "end-maintenance/:host/:port", this.EndMaintenanceByInstanceKey) + this.registerAPIRequest(m, "in-maintenance/:host/:port", this.InMaintenance) + this.registerAPIRequest(m, "end-maintenance/:maintenanceKey", this.EndMaintenance) + this.registerAPIRequest(m, "maintenance", this.Maintenance) + this.registerAPIRequest(m, "begin-downtime/:host/:port/:owner/:reason", this.BeginDowntime) + this.registerAPIRequest(m, "begin-downtime/:host/:port/:owner/:reason/:duration", this.BeginDowntime) + this.registerAPIRequest(m, 
"end-downtime/:host/:port", this.EndDowntime) + + // Recovery: + this.registerAPIRequest(m, "replication-analysis", this.ReplicationAnalysis) + this.registerAPIRequest(m, "replication-analysis/:clusterName", this.ReplicationAnalysisForCluster) + this.registerAPIRequest(m, "replication-analysis/instance/:host/:port", this.ReplicationAnalysisForKey) + this.registerAPIRequest(m, "recover/:host/:port", this.Recover) + this.registerAPIRequest(m, "recover/:host/:port/:candidateHost/:candidatePort", this.Recover) + this.registerAPIRequest(m, "recover-lite/:host/:port", this.RecoverLite) + this.registerAPIRequest(m, "recover-lite/:host/:port/:candidateHost/:candidatePort", this.RecoverLite) + this.registerAPIRequest(m, "graceful-master-takeover/:host/:port", this.GracefulMasterTakeover) + this.registerAPIRequest(m, "graceful-master-takeover/:host/:port/:designatedHost/:designatedPort", this.GracefulMasterTakeover) + this.registerAPIRequest(m, "graceful-master-takeover/:clusterHint", this.GracefulMasterTakeover) + this.registerAPIRequest(m, "graceful-master-takeover/:clusterHint/:designatedHost/:designatedPort", this.GracefulMasterTakeover) + this.registerAPIRequest(m, "graceful-master-takeover-auto/:host/:port", this.GracefulMasterTakeoverAuto) + this.registerAPIRequest(m, "graceful-master-takeover-auto/:host/:port/:designatedHost/:designatedPort", this.GracefulMasterTakeoverAuto) + this.registerAPIRequest(m, "graceful-master-takeover-auto/:clusterHint", this.GracefulMasterTakeoverAuto) + this.registerAPIRequest(m, "graceful-master-takeover-auto/:clusterHint/:designatedHost/:designatedPort", this.GracefulMasterTakeoverAuto) + this.registerAPIRequest(m, "force-master-failover/:host/:port", this.ForceMasterFailover) + this.registerAPIRequest(m, "force-master-failover/:clusterHint", this.ForceMasterFailover) + this.registerAPIRequest(m, "force-master-takeover/:clusterHint/:designatedHost/:designatedPort", this.ForceMasterTakeover) + this.registerAPIRequest(m, "force-master-takeover/:host/:port/:designatedHost/:designatedPort", this.ForceMasterTakeover) + this.registerAPIRequest(m, "register-candidate/:host/:port/:promotionRule", this.RegisterCandidate) + this.registerAPIRequest(m, "automated-recovery-filters", this.AutomatedRecoveryFilters) + this.registerAPIRequest(m, "audit-failure-detection", this.AuditFailureDetection) + this.registerAPIRequest(m, "audit-failure-detection/:page", this.AuditFailureDetection) + this.registerAPIRequest(m, "audit-failure-detection/id/:id", this.AuditFailureDetection) + this.registerAPIRequest(m, "audit-failure-detection/alias/:clusterAlias", this.AuditFailureDetection) + this.registerAPIRequest(m, "audit-failure-detection/alias/:clusterAlias/:page", this.AuditFailureDetection) + this.registerAPIRequest(m, "replication-analysis-changelog", this.ReadReplicationAnalysisChangelog) + this.registerAPIRequest(m, "audit-recovery", this.AuditRecovery) + this.registerAPIRequest(m, "audit-recovery/:page", this.AuditRecovery) + this.registerAPIRequest(m, "audit-recovery/id/:id", this.AuditRecovery) + this.registerAPIRequest(m, "audit-recovery/uid/:uid", this.AuditRecovery) + this.registerAPIRequest(m, "audit-recovery/cluster/:clusterName", this.AuditRecovery) + this.registerAPIRequest(m, "audit-recovery/cluster/:clusterName/:page", this.AuditRecovery) + this.registerAPIRequest(m, "audit-recovery/alias/:clusterAlias", this.AuditRecovery) + this.registerAPIRequest(m, "audit-recovery/alias/:clusterAlias/:page", this.AuditRecovery) + this.registerAPIRequest(m, 
"audit-recovery-steps/:uid", this.AuditRecoverySteps) + this.registerAPIRequest(m, "active-cluster-recovery/:clusterName", this.ActiveClusterRecovery) + this.registerAPIRequest(m, "recently-active-cluster-recovery/:clusterName", this.RecentlyActiveClusterRecovery) + this.registerAPIRequest(m, "recently-active-instance-recovery/:host/:port", this.RecentlyActiveInstanceRecovery) + this.registerAPIRequest(m, "ack-recovery/cluster/:clusterHint", this.AcknowledgeClusterRecoveries) + this.registerAPIRequest(m, "ack-recovery/cluster/alias/:clusterAlias", this.AcknowledgeClusterRecoveries) + this.registerAPIRequest(m, "ack-recovery/instance/:host/:port", this.AcknowledgeInstanceRecoveries) + this.registerAPIRequest(m, "ack-recovery/:recoveryId", this.AcknowledgeRecovery) + this.registerAPIRequest(m, "ack-recovery/uid/:uid", this.AcknowledgeRecovery) + this.registerAPIRequest(m, "ack-all-recoveries", this.AcknowledgeAllRecoveries) + this.registerAPIRequest(m, "blocked-recoveries", this.BlockedRecoveries) + this.registerAPIRequest(m, "blocked-recoveries/cluster/:clusterName", this.BlockedRecoveries) + this.registerAPIRequest(m, "disable-global-recoveries", this.DisableGlobalRecoveries) + this.registerAPIRequest(m, "enable-global-recoveries", this.EnableGlobalRecoveries) + this.registerAPIRequest(m, "check-global-recoveries", this.CheckGlobalRecoveries) + + // General + this.registerAPIRequest(m, "problems", this.Problems) + this.registerAPIRequest(m, "problems/:clusterName", this.Problems) + this.registerAPIRequest(m, "audit", this.Audit) + this.registerAPIRequest(m, "audit/:page", this.Audit) + this.registerAPIRequest(m, "audit/instance/:host/:port", this.Audit) + this.registerAPIRequest(m, "audit/instance/:host/:port/:page", this.Audit) + this.registerAPIRequest(m, "resolve/:host/:port", this.Resolve) + + // Meta, no proxy + this.registerAPIRequestNoProxy(m, "headers", this.Headers) + this.registerAPIRequestNoProxy(m, "health", this.Health) + this.registerAPIRequestNoProxy(m, "lb-check", this.LBCheck) + this.registerAPIRequestNoProxy(m, "_ping", this.LBCheck) + this.registerAPIRequestNoProxy(m, "leader-check", this.LeaderCheck) + this.registerAPIRequestNoProxy(m, "leader-check/:errorStatusCode", this.LeaderCheck) + this.registerAPIRequestNoProxy(m, "grab-election", this.GrabElection) + this.registerAPIRequest(m, "raft-add-peer/:addr", this.RaftAddPeer) // delegated to the raft leader + this.registerAPIRequest(m, "raft-remove-peer/:addr", this.RaftRemovePeer) // delegated to the raft leader + this.registerAPIRequestNoProxy(m, "raft-yield/:node", this.RaftYield) + this.registerAPIRequestNoProxy(m, "raft-yield-hint/:hint", this.RaftYieldHint) + this.registerAPIRequestNoProxy(m, "raft-peers", this.RaftPeers) + this.registerAPIRequestNoProxy(m, "raft-state", this.RaftState) + this.registerAPIRequestNoProxy(m, "raft-leader", this.RaftLeader) + this.registerAPIRequestNoProxy(m, "raft-health", this.RaftHealth) + this.registerAPIRequestNoProxy(m, "raft-status", this.RaftStatus) + this.registerAPIRequestNoProxy(m, "raft-snapshot", this.RaftSnapshot) + this.registerAPIRequestNoProxy(m, "raft-follower-health-report/:authenticationToken/:raftBind/:raftAdvertise", this.RaftFollowerHealthReport) + this.registerAPIRequestNoProxy(m, "reload-configuration", this.ReloadConfiguration) + this.registerAPIRequestNoProxy(m, "hostname-resolve-cache", this.HostnameResolveCache) + this.registerAPIRequestNoProxy(m, "reset-hostname-resolve-cache", this.ResetHostnameResolveCache) + // Meta + this.registerAPIRequest(m, 
"routed-leader-check", this.LeaderCheck) + this.registerAPIRequest(m, "reelect", this.Reelect) + this.registerAPIRequest(m, "reload-cluster-alias", this.ReloadClusterAlias) + this.registerAPIRequest(m, "deregister-hostname-unresolve/:host/:port", this.DeregisterHostnameUnresolve) + this.registerAPIRequest(m, "register-hostname-unresolve/:host/:port/:virtualname", this.RegisterHostnameUnresolve) + + // Bulk access to information + this.registerAPIRequest(m, "bulk-instances", this.BulkInstances) + this.registerAPIRequest(m, "bulk-promotion-rules", this.BulkPromotionRules) + + // Monitoring + this.registerAPIRequest(m, "discovery-metrics-raw/:seconds", this.DiscoveryMetricsRaw) + this.registerAPIRequest(m, "discovery-metrics-aggregated/:seconds", this.DiscoveryMetricsAggregated) + this.registerAPIRequest(m, "discovery-queue-metrics-raw/:seconds", this.DiscoveryQueueMetricsRaw) + this.registerAPIRequest(m, "discovery-queue-metrics-aggregated/:seconds", this.DiscoveryQueueMetricsAggregated) + this.registerAPIRequest(m, "backend-query-metrics-raw/:seconds", this.BackendQueryMetricsRaw) + this.registerAPIRequest(m, "backend-query-metrics-aggregated/:seconds", this.BackendQueryMetricsAggregated) + this.registerAPIRequest(m, "write-buffer-metrics-raw/:seconds", this.WriteBufferMetricsRaw) + this.registerAPIRequest(m, "write-buffer-metrics-aggregated/:seconds", this.WriteBufferMetricsAggregated) + + // Agents + this.registerAPIRequest(m, "agents", this.Agents) + this.registerAPIRequest(m, "agent/:host", this.Agent) + this.registerAPIRequest(m, "agent-umount/:host", this.AgentUnmount) + this.registerAPIRequest(m, "agent-mount/:host", this.AgentMountLV) + this.registerAPIRequest(m, "agent-create-snapshot/:host", this.AgentCreateSnapshot) + this.registerAPIRequest(m, "agent-removelv/:host", this.AgentRemoveLV) + this.registerAPIRequest(m, "agent-mysql-stop/:host", this.AgentMySQLStop) + this.registerAPIRequest(m, "agent-mysql-start/:host", this.AgentMySQLStart) + this.registerAPIRequest(m, "agent-seed/:targetHost/:sourceHost", this.AgentSeed) + this.registerAPIRequest(m, "agent-active-seeds/:host", this.AgentActiveSeeds) + this.registerAPIRequest(m, "agent-recent-seeds/:host", this.AgentRecentSeeds) + this.registerAPIRequest(m, "agent-seed-details/:seedId", this.AgentSeedDetails) + this.registerAPIRequest(m, "agent-seed-states/:seedId", this.AgentSeedStates) + this.registerAPIRequest(m, "agent-abort-seed/:seedId", this.AbortSeed) + this.registerAPIRequest(m, "agent-custom-command/:host/:command", this.AgentCustomCommand) + this.registerAPIRequest(m, "seeds", this.Seeds) + + // Configurable status check endpoint + if config.Config.StatusEndpoint == config.DefaultStatusAPIEndpoint { + this.registerAPIRequestNoProxy(m, "status", this.StatusCheck) + } else { + m.Get(config.Config.StatusEndpoint, this.StatusCheck) + } +} diff --git a/go/vt/orchestrator/http/api_test.go b/go/vt/orchestrator/http/api_test.go new file mode 100644 index 0000000000..3070fbec93 --- /dev/null +++ b/go/vt/orchestrator/http/api_test.go @@ -0,0 +1,55 @@ +package http + +import ( + "strings" + "testing" + + "github.com/go-martini/martini" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func init() { + config.Config.HostnameResolveMethod = "none" + config.MarkConfigurationLoaded() + log.SetLevel(log.ERROR) +} + +func TestGetSynonymPath(t *testing.T) { + api := HttpAPI{} + + { + path := "relocate-slaves" + 
synonym := api.getSynonymPath(path) + test.S(t).ExpectEquals(synonym, "relocate-replicas") + } + { + path := "relocate-slaves/:host/:port" + synonym := api.getSynonymPath(path) + test.S(t).ExpectEquals(synonym, "relocate-replicas/:host/:port") + } +} + +func TestKnownPaths(t *testing.T) { + m := martini.Classic() + api := HttpAPI{} + + api.RegisterRequests(m) + + pathsMap := make(map[string]bool) + for _, path := range registeredPaths { + pathBase := strings.Split(path, "/")[0] + pathsMap[pathBase] = true + } + test.S(t).ExpectTrue(pathsMap["health"]) + test.S(t).ExpectTrue(pathsMap["lb-check"]) + test.S(t).ExpectTrue(pathsMap["relocate"]) + test.S(t).ExpectTrue(pathsMap["relocate-slaves"]) + + for path, synonym := range apiSynonyms { + test.S(t).ExpectTrue(pathsMap[path]) + test.S(t).ExpectTrue(pathsMap[synonym]) + } +} diff --git a/go/vt/orchestrator/http/httpbase.go b/go/vt/orchestrator/http/httpbase.go new file mode 100644 index 0000000000..f2106561df --- /dev/null +++ b/go/vt/orchestrator/http/httpbase.go @@ -0,0 +1,176 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package http + +import ( + "fmt" + "net/http" + "strings" + + "github.com/martini-contrib/auth" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/inst" + "vitess.io/vitess/go/vt/orchestrator/os" + "vitess.io/vitess/go/vt/orchestrator/process" + orcraft "vitess.io/vitess/go/vt/orchestrator/raft" +) + +func getProxyAuthUser(req *http.Request) string { + for _, user := range req.Header[config.Config.AuthUserHeader] { + return user + } + return "" +} + +// isAuthorizedForAction checks req to see whether authenticated user has write-privileges. +// This depends on configured authentication method. +func isAuthorizedForAction(req *http.Request, user auth.User) bool { + if config.Config.ReadOnly { + return false + } + + if orcraft.IsRaftEnabled() && !orcraft.IsLeader() { + // A raft member that is not a leader is unauthorized. 
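For illustration only, not part of the diff: a minimal sketch of how the "proxy" branch of isAuthorizedForAction is expected to behave, assuming raft is disabled. AuthenticationMethod, AuthUserHeader, PowerAuthUsers and ReadOnly are the real configuration fields referenced by the code; the header name, route and user below are made-up example values.

    func proxyAuthorizationSketch() {
    	config.Config.ReadOnly = false
    	config.Config.AuthenticationMethod = "proxy"
    	config.Config.AuthUserHeader = "X-Forwarded-User" // assumed header name, not necessarily the default
    	config.Config.PowerAuthUsers = []string{"alice"}  // only "alice" is granted write access in this sketch

    	req, _ := http.NewRequest("GET", "/api/begin-downtime/db1/3306/alice/maintenance", nil)
    	req.Header.Set(config.Config.AuthUserHeader, "alice")

    	fmt.Println(isAuthorizedForAction(req, "")) // true: the proxy-supplied user is listed in PowerAuthUsers
    }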
+		return false
+	}
+
+	switch strings.ToLower(config.Config.AuthenticationMethod) {
+	case "basic":
+		{
+			// The mere fact we're here means the user has passed authentication
+			return true
+		}
+	case "multi":
+		{
+			if string(user) == "readonly" {
+				// read only
+				return false
+			}
+			// passed authentication ==> writeable
+			return true
+		}
+	case "proxy":
+		{
+			authUser := getProxyAuthUser(req)
+			for _, configPowerAuthUser := range config.Config.PowerAuthUsers {
+				if configPowerAuthUser == "*" || configPowerAuthUser == authUser {
+					return true
+				}
+			}
+			// check the user's group is one of those listed here
+			if len(config.Config.PowerAuthGroups) > 0 && os.UserInGroups(authUser, config.Config.PowerAuthGroups) {
+				return true
+			}
+			return false
+		}
+	case "token":
+		{
+			cookie, err := req.Cookie("access-token")
+			if err != nil {
+				return false
+			}
+
+			publicToken := strings.Split(cookie.Value, ":")[0]
+			secretToken := strings.Split(cookie.Value, ":")[1]
+			result, _ := process.TokenIsValid(publicToken, secretToken)
+			return result
+		}
+	case "oauth":
+		{
+			return false
+		}
+	default:
+		{
+			// Default: no authentication method
+			return true
+		}
+	}
+}
+
+func authenticateToken(publicToken string, resp http.ResponseWriter) error {
+	secretToken, err := process.AcquireAccessToken(publicToken)
+	if err != nil {
+		return err
+	}
+	cookieValue := fmt.Sprintf("%s:%s", publicToken, secretToken)
+	cookie := &http.Cookie{Name: "access-token", Value: cookieValue, Path: "/"}
+	http.SetCookie(resp, cookie)
+	return nil
+}
+
+// getUserId returns the authenticated user id, if available, depending on authentication method.
+func getUserId(req *http.Request, user auth.User) string {
+	if config.Config.ReadOnly {
+		return ""
+	}
+
+	switch strings.ToLower(config.Config.AuthenticationMethod) {
+	case "basic":
+		{
+			return string(user)
+		}
+	case "multi":
+		{
+			return string(user)
+		}
+	case "proxy":
+		{
+			return getProxyAuthUser(req)
+		}
+	case "token":
+		{
+			return ""
+		}
+	default:
+		{
+			return ""
+		}
+	}
+}
+
+func getClusterHint(params map[string]string) string {
+	if params["clusterHint"] != "" {
+		return params["clusterHint"]
+	}
+	if params["clusterName"] != "" {
+		return params["clusterName"]
+	}
+	if params["host"] != "" && params["port"] != "" {
+		return fmt.Sprintf("%s:%s", params["host"], params["port"])
+	}
+	return ""
+}
+
+// figureClusterName is a convenience function to get a cluster name from hints
+func figureClusterName(hint string) (clusterName string, err error) {
+	if hint == "" {
+		return "", fmt.Errorf("Unable to determine cluster name by empty hint")
+	}
+	instanceKey, _ := inst.ParseRawInstanceKey(hint)
+	return inst.FigureClusterName(hint, instanceKey, nil)
+}
+
+// getClusterNameIfExists returns a cluster name by params hint, or an empty cluster name
+// if no hint is given
+func getClusterNameIfExists(params map[string]string) (clusterName string, err error) {
+	if clusterHint := getClusterHint(params); clusterHint == "" {
+		return "", nil
+	} else {
+		return figureClusterName(clusterHint)
+	}
+}
diff --git a/go/vt/orchestrator/http/raft_reverse_proxy.go b/go/vt/orchestrator/http/raft_reverse_proxy.go
new file mode 100644
index 0000000000..59cefe6e07
--- /dev/null
+++ b/go/vt/orchestrator/http/raft_reverse_proxy.go
@@ -0,0 +1,48 @@
+package http
+
+import (
+	"net/http"
+	"net/http/httputil"
+	"net/url"
+	"strings"
+
+	"vitess.io/vitess/go/vt/orchestrator/external/golib/log"
+	orcraft "vitess.io/vitess/go/vt/orchestrator/raft"
+
+	"github.com/go-martini/martini"
+
"vitess.io/vitess/go/vt/orchestrator/config" +) + +func raftReverseProxy(w http.ResponseWriter, r *http.Request, c martini.Context) { + if !orcraft.IsRaftEnabled() { + // No raft, so no reverse proxy to the leader + return + } + if orcraft.IsLeader() { + // I am the leader. I will handle the request directly. + return + } + if orcraft.GetLeader() == "" { + return + } + if orcraft.LeaderURI.IsThisLeaderURI() { + // Although I'm not the leader, the value I see for LeaderURI is my own. + // I'm probably not up-to-date with my raft transaction log and don't have the latest information. + // But anyway, obviously not going to redirect to myself. + // Gonna return: this isn't ideal, because I'm not really the leader. If the user tries to + // run an operation they'll fail. + return + } + url, err := url.Parse(orcraft.LeaderURI.Get()) + if err != nil { + log.Errore(err) + return + } + r.Header.Del("Accept-Encoding") + switch strings.ToLower(config.Config.AuthenticationMethod) { + case "basic", "multi": + r.SetBasicAuth(config.Config.HTTPAuthUser, config.Config.HTTPAuthPassword) + } + proxy := httputil.NewSingleHostReverseProxy(url) + proxy.ServeHTTP(w, r) +} diff --git a/go/vt/orchestrator/http/web.go b/go/vt/orchestrator/http/web.go new file mode 100644 index 0000000000..1dbc1396f6 --- /dev/null +++ b/go/vt/orchestrator/http/web.go @@ -0,0 +1,476 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package http + +import ( + "expvar" + "fmt" + "net/http" + "net/http/pprof" + "strconv" + "text/template" + + "github.com/go-martini/martini" + "github.com/martini-contrib/auth" + "github.com/martini-contrib/render" + "github.com/rcrowley/go-metrics" + "github.com/rcrowley/go-metrics/exp" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/inst" +) + +// HttpWeb is the web requests server, mapping each request to a web page +type HttpWeb struct { + URLPrefix string +} + +var Web HttpWeb = HttpWeb{} + +func (this *HttpWeb) getInstanceKey(host string, port string) (inst.InstanceKey, error) { + instanceKey := inst.InstanceKey{Hostname: host} + var err error + + if instanceKey.Port, err = strconv.Atoi(port); err != nil { + return instanceKey, fmt.Errorf("Invalid port: %s", port) + } + return instanceKey, err +} + +func (this *HttpWeb) AccessToken(params martini.Params, r render.Render, req *http.Request, resp http.ResponseWriter, user auth.User) { + publicToken := template.JSEscapeString(req.URL.Query().Get("publicToken")) + err := authenticateToken(publicToken, resp) + if err != nil { + r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + r.Redirect(this.URLPrefix + "/") +} + +func (this *HttpWeb) Index(params martini.Params, r render.Render, req *http.Request, user auth.User) { + // Redirect index so that all web URLs begin with "/web/". + // We also redirect /web/ to /web/clusters so that + // the Clusters page has a single canonical URL. 
+ r.Redirect(this.URLPrefix + "/web/clusters") +} + +func (this *HttpWeb) Clusters(params martini.Params, r render.Render, req *http.Request, user auth.User) { + r.HTML(200, "templates/clusters", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "clusters", + "autoshow_problems": false, + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "removeTextFromHostnameDisplay": config.Config.RemoveTextFromHostnameDisplay, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) ClustersAnalysis(params martini.Params, r render.Render, req *http.Request, user auth.User) { + r.HTML(200, "templates/clusters_analysis", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "clusters", + "autoshow_problems": false, + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "removeTextFromHostnameDisplay": config.Config.RemoveTextFromHostnameDisplay, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) Cluster(params martini.Params, r render.Render, req *http.Request, user auth.User) { + clusterName, _ := figureClusterName(params["clusterName"]) + + r.HTML(200, "templates/cluster", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "cluster", + "clusterName": clusterName, + "autoshow_problems": true, + "contextMenuVisible": true, + "pseudoGTIDModeEnabled": (config.Config.PseudoGTIDPattern != ""), + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "removeTextFromHostnameDisplay": config.Config.RemoveTextFromHostnameDisplay, + "compactDisplay": template.JSEscapeString(req.URL.Query().Get("compact")), + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) ClusterByAlias(params martini.Params, r render.Render, req *http.Request, user auth.User) { + clusterName, err := inst.GetClusterByAlias(params["clusterAlias"]) + // Willing to accept the case of multiple clusters; we just present one + if clusterName == "" && err != nil { + r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + params["clusterName"] = clusterName + this.Cluster(params, r, req, user) +} + +func (this *HttpWeb) ClusterByInstance(params martini.Params, r render.Render, req *http.Request, user auth.User) { + instanceKey, err := this.getInstanceKey(params["host"], params["port"]) + if err != nil { + r.JSON(200, &APIResponse{Code: ERROR, Message: err.Error()}) + return + } + instance, found, err := inst.ReadInstance(&instanceKey) + if (!found) || (err != nil) { + r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("Cannot read instance: %+v", instanceKey)}) + return + } + + // Willing to accept the case of multiple clusters; we just present one + if instance.ClusterName == "" && err != nil { + r.JSON(200, &APIResponse{Code: ERROR, Message: fmt.Sprintf("%+v", err)}) + return + } + + params["clusterName"] = instance.ClusterName + this.Cluster(params, r, req, user) +} + +func (this *HttpWeb) ClusterPools(params martini.Params, r render.Render, req *http.Request, user auth.User) { + clusterName, _ := figureClusterName(params["clusterName"]) + r.HTML(200, "templates/cluster_pools", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "cluster pools", + "clusterName": clusterName, + "autoshow_problems": false, 
// because pool screen by default expands all hosts + "contextMenuVisible": true, + "pseudoGTIDModeEnabled": (config.Config.PseudoGTIDPattern != ""), + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "removeTextFromHostnameDisplay": config.Config.RemoveTextFromHostnameDisplay, + "compactDisplay": template.JSEscapeString(req.URL.Query().Get("compact")), + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) Search(params martini.Params, r render.Render, req *http.Request, user auth.User) { + searchString := params["searchString"] + if searchString == "" { + searchString = req.URL.Query().Get("s") + } + searchString = template.JSEscapeString(searchString) + r.HTML(200, "templates/search", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "search", + "searchString": searchString, + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) Discover(params martini.Params, r render.Render, req *http.Request, user auth.User) { + + r.HTML(200, "templates/discover", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "discover", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) Audit(params martini.Params, r render.Render, req *http.Request, user auth.User) { + page, err := strconv.Atoi(params["page"]) + if err != nil { + page = 0 + } + + r.HTML(200, "templates/audit", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "audit", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "page": page, + "auditHostname": params["host"], + "auditPort": params["port"], + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) AuditRecovery(params martini.Params, r render.Render, req *http.Request, user auth.User) { + page, err := strconv.Atoi(params["page"]) + if err != nil { + page = 0 + } + recoveryId, err := strconv.ParseInt(params["id"], 10, 0) + if err != nil { + recoveryId = 0 + } + recoveryUid := params["uid"] + clusterAlias := params["clusterAlias"] + + clusterName, _ := figureClusterName(params["clusterName"]) + r.HTML(200, "templates/audit_recovery", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "audit-recovery", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "page": page, + "clusterName": clusterName, + "clusterAlias": clusterAlias, + "recoveryId": recoveryId, + "recoveryUid": recoveryUid, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) AuditFailureDetection(params martini.Params, r render.Render, req *http.Request, user auth.User) { + page, err := strconv.Atoi(params["page"]) + if err != nil { + page = 0 + } + detectionId, err := strconv.ParseInt(params["id"], 10, 0) + if err != nil { + detectionId = 0 + } + clusterAlias := params["clusterAlias"] + + r.HTML(200, "templates/audit_failure_detection", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + 
"title": "audit-failure-detection", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "page": page, + "detectionId": detectionId, + "clusterAlias": clusterAlias, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) Agents(params martini.Params, r render.Render, req *http.Request, user auth.User) { + r.HTML(200, "templates/agents", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "agents", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) Agent(params martini.Params, r render.Render, req *http.Request, user auth.User) { + r.HTML(200, "templates/agent", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "agent", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "agentHost": params["host"], + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) AgentSeedDetails(params martini.Params, r render.Render, req *http.Request, user auth.User) { + r.HTML(200, "templates/agent_seed_details", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "agent seed details", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "seedId": params["seedId"], + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) Seeds(params martini.Params, r render.Render, req *http.Request, user auth.User) { + r.HTML(200, "templates/seeds", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "seeds", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) Home(params martini.Params, r render.Render, req *http.Request, user auth.User) { + + r.HTML(200, "templates/home", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "home", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) About(params martini.Params, r render.Render, req *http.Request, user auth.User) { + + r.HTML(200, "templates/about", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "about", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) KeepCalm(params martini.Params, r render.Render, req *http.Request, user auth.User) { + + r.HTML(200, "templates/keep-calm", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "Keep Calm", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) FAQ(params martini.Params, r 
render.Render, req *http.Request, user auth.User) { + + r.HTML(200, "templates/faq", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "FAQ", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) Status(params martini.Params, r render.Render, req *http.Request, user auth.User) { + + r.HTML(200, "templates/status", map[string]interface{}{ + "agentsHttpActive": config.Config.ServeAgentsHttp, + "title": "status", + "authorizedForAction": isAuthorizedForAction(req, user), + "userId": getUserId(req, user), + "autoshow_problems": false, + "prefix": this.URLPrefix, + "webMessage": config.Config.WebMessage, + }) +} + +func (this *HttpWeb) registerWebRequest(m *martini.ClassicMartini, path string, handler martini.Handler) { + fullPath := fmt.Sprintf("%s/web/%s", this.URLPrefix, path) + if path == "/" { + fullPath = fmt.Sprintf("%s/", this.URLPrefix) + } + + if config.Config.RaftEnabled { + m.Get(fullPath, raftReverseProxy, handler) + } else { + m.Get(fullPath, handler) + } +} + +// RegisterRequests makes for the de-facto list of known Web calls +func (this *HttpWeb) RegisterRequests(m *martini.ClassicMartini) { + this.registerWebRequest(m, "access-token", this.AccessToken) + this.registerWebRequest(m, "", this.Index) + this.registerWebRequest(m, "/", this.Index) + this.registerWebRequest(m, "home", this.About) + this.registerWebRequest(m, "about", this.About) + this.registerWebRequest(m, "keep-calm", this.KeepCalm) + this.registerWebRequest(m, "faq", this.FAQ) + this.registerWebRequest(m, "status", this.Status) + this.registerWebRequest(m, "clusters", this.Clusters) + this.registerWebRequest(m, "clusters-analysis", this.ClustersAnalysis) + this.registerWebRequest(m, "cluster/:clusterName", this.Cluster) + this.registerWebRequest(m, "cluster/alias/:clusterAlias", this.ClusterByAlias) + this.registerWebRequest(m, "cluster/instance/:host/:port", this.ClusterByInstance) + this.registerWebRequest(m, "cluster-pools/:clusterName", this.ClusterPools) + this.registerWebRequest(m, "search/:searchString", this.Search) + this.registerWebRequest(m, "search", this.Search) + this.registerWebRequest(m, "discover", this.Discover) + this.registerWebRequest(m, "audit", this.Audit) + this.registerWebRequest(m, "audit/:page", this.Audit) + this.registerWebRequest(m, "audit/instance/:host/:port", this.Audit) + this.registerWebRequest(m, "audit/instance/:host/:port/:page", this.Audit) + this.registerWebRequest(m, "audit-recovery", this.AuditRecovery) + this.registerWebRequest(m, "audit-recovery/:page", this.AuditRecovery) + this.registerWebRequest(m, "audit-recovery/id/:id", this.AuditRecovery) + this.registerWebRequest(m, "audit-recovery/uid/:uid", this.AuditRecovery) + this.registerWebRequest(m, "audit-recovery/cluster/:clusterName", this.AuditRecovery) + this.registerWebRequest(m, "audit-recovery/cluster/:clusterName/:page", this.AuditRecovery) + this.registerWebRequest(m, "audit-recovery/alias/:clusterAlias", this.AuditRecovery) + this.registerWebRequest(m, "audit-recovery/alias/:clusterAlias/:page", this.AuditRecovery) + this.registerWebRequest(m, "audit-failure-detection", this.AuditFailureDetection) + this.registerWebRequest(m, "audit-failure-detection/:page", this.AuditFailureDetection) + this.registerWebRequest(m, "audit-failure-detection/id/:id", this.AuditFailureDetection) + this.registerWebRequest(m, 
"audit-failure-detection/alias/:clusterAlias", this.AuditFailureDetection) + this.registerWebRequest(m, "audit-failure-detection/alias/:clusterAlias/:page", this.AuditFailureDetection) + this.registerWebRequest(m, "audit-recovery-steps/:uid", this.AuditRecovery) + this.registerWebRequest(m, "agents", this.Agents) + this.registerWebRequest(m, "agent/:host", this.Agent) + this.registerWebRequest(m, "seed-details/:seedId", this.AgentSeedDetails) + this.registerWebRequest(m, "seeds", this.Seeds) + + this.RegisterDebug(m) +} + +// RegisterDebug adds handlers for /debug/vars (expvar) and /debug/pprof (net/http/pprof) support +func (this *HttpWeb) RegisterDebug(m *martini.ClassicMartini) { + m.Get(this.URLPrefix+"/debug/vars", func(w http.ResponseWriter, r *http.Request) { + // from expvar.go, since the expvarHandler isn't exported :( + w.Header().Set("Content-Type", "application/json; charset=utf-8") + fmt.Fprintf(w, "{\n") + first := true + expvar.Do(func(kv expvar.KeyValue) { + if !first { + fmt.Fprintf(w, ",\n") + } + first = false + fmt.Fprintf(w, "%q: %s", kv.Key, kv.Value) + }) + fmt.Fprintf(w, "\n}\n") + }) + + // list all the /debug/ endpoints we want + m.Get(this.URLPrefix+"/debug/pprof", pprof.Index) + m.Get(this.URLPrefix+"/debug/pprof/cmdline", pprof.Cmdline) + m.Get(this.URLPrefix+"/debug/pprof/profile", pprof.Profile) + m.Get(this.URLPrefix+"/debug/pprof/symbol", pprof.Symbol) + m.Post(this.URLPrefix+"/debug/pprof/symbol", pprof.Symbol) + m.Get(this.URLPrefix+"/debug/pprof/block", pprof.Handler("block").ServeHTTP) + m.Get(this.URLPrefix+"/debug/pprof/heap", pprof.Handler("heap").ServeHTTP) + m.Get(this.URLPrefix+"/debug/pprof/goroutine", pprof.Handler("goroutine").ServeHTTP) + m.Get(this.URLPrefix+"/debug/pprof/threadcreate", pprof.Handler("threadcreate").ServeHTTP) + + // go-metrics + m.Get(this.URLPrefix+"/debug/metrics", exp.ExpHandler(metrics.DefaultRegistry)) +} diff --git a/go/vt/orchestrator/inst/analysis.go b/go/vt/orchestrator/inst/analysis.go new file mode 100644 index 0000000000..8ebe39ee17 --- /dev/null +++ b/go/vt/orchestrator/inst/analysis.go @@ -0,0 +1,226 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "encoding/json" + "fmt" + "strings" + + "vitess.io/vitess/go/vt/orchestrator/config" +) + +type AnalysisCode string +type StructureAnalysisCode string + +const ( + NoProblem AnalysisCode = "NoProblem" + DeadMasterWithoutReplicas = "DeadMasterWithoutReplicas" + DeadMaster = "DeadMaster" + DeadMasterAndReplicas = "DeadMasterAndReplicas" + DeadMasterAndSomeReplicas = "DeadMasterAndSomeReplicas" + UnreachableMasterWithLaggingReplicas = "UnreachableMasterWithLaggingReplicas" + UnreachableMaster = "UnreachableMaster" + MasterSingleReplicaNotReplicating = "MasterSingleReplicaNotReplicating" + MasterSingleReplicaDead = "MasterSingleReplicaDead" + AllMasterReplicasNotReplicating = "AllMasterReplicasNotReplicating" + AllMasterReplicasNotReplicatingOrDead = "AllMasterReplicasNotReplicatingOrDead" + LockedSemiSyncMasterHypothesis = "LockedSemiSyncMasterHypothesis" + LockedSemiSyncMaster = "LockedSemiSyncMaster" + MasterWithoutReplicas = "MasterWithoutReplicas" + DeadCoMaster = "DeadCoMaster" + DeadCoMasterAndSomeReplicas = "DeadCoMasterAndSomeReplicas" + UnreachableCoMaster = "UnreachableCoMaster" + AllCoMasterReplicasNotReplicating = "AllCoMasterReplicasNotReplicating" + DeadIntermediateMaster = "DeadIntermediateMaster" + DeadIntermediateMasterWithSingleReplica = "DeadIntermediateMasterWithSingleReplica" + DeadIntermediateMasterWithSingleReplicaFailingToConnect = "DeadIntermediateMasterWithSingleReplicaFailingToConnect" + DeadIntermediateMasterAndSomeReplicas = "DeadIntermediateMasterAndSomeReplicas" + DeadIntermediateMasterAndReplicas = "DeadIntermediateMasterAndReplicas" + UnreachableIntermediateMasterWithLaggingReplicas = "UnreachableIntermediateMasterWithLaggingReplicas" + UnreachableIntermediateMaster = "UnreachableIntermediateMaster" + AllIntermediateMasterReplicasFailingToConnectOrDead = "AllIntermediateMasterReplicasFailingToConnectOrDead" + AllIntermediateMasterReplicasNotReplicating = "AllIntermediateMasterReplicasNotReplicating" + FirstTierReplicaFailingToConnectToMaster = "FirstTierReplicaFailingToConnectToMaster" + BinlogServerFailingToConnectToMaster = "BinlogServerFailingToConnectToMaster" +) + +const ( + StatementAndMixedLoggingReplicasStructureWarning StructureAnalysisCode = "StatementAndMixedLoggingReplicasStructureWarning" + StatementAndRowLoggingReplicasStructureWarning = "StatementAndRowLoggingReplicasStructureWarning" + MixedAndRowLoggingReplicasStructureWarning = "MixedAndRowLoggingReplicasStructureWarning" + MultipleMajorVersionsLoggingReplicasStructureWarning = "MultipleMajorVersionsLoggingReplicasStructureWarning" + NoLoggingReplicasStructureWarning = "NoLoggingReplicasStructureWarning" + DifferentGTIDModesStructureWarning = "DifferentGTIDModesStructureWarning" + ErrantGTIDStructureWarning = "ErrantGTIDStructureWarning" + NoFailoverSupportStructureWarning = "NoFailoverSupportStructureWarning" + NoWriteableMasterStructureWarning = "NoWriteableMasterStructureWarning" + NotEnoughValidSemiSyncReplicasStructureWarning = "NotEnoughValidSemiSyncReplicasStructureWarning" +) + +type InstanceAnalysis struct { + key *InstanceKey + analysis AnalysisCode +} + +func NewInstanceAnalysis(instanceKey *InstanceKey, analysis AnalysisCode) *InstanceAnalysis { + return &InstanceAnalysis{ + key: instanceKey, + analysis: analysis, + } +} + +func (instanceAnalysis *InstanceAnalysis) String() string { + return fmt.Sprintf("%s/%s", instanceAnalysis.key.StringCode(), string(instanceAnalysis.analysis)) +} + +// PeerAnalysisMap indicates the number of peers agreeing on 
an analysis. +// Key of this map is a InstanceAnalysis.String() +type PeerAnalysisMap map[string]int + +type ReplicationAnalysisHints struct { + IncludeDowntimed bool + IncludeNoProblem bool + AuditAnalysis bool +} + +const ( + ForceMasterFailoverCommandHint string = "force-master-failover" + ForceMasterTakeoverCommandHint string = "force-master-takeover" + GracefulMasterTakeoverCommandHint string = "graceful-master-takeover" +) + +type AnalysisInstanceType string + +const ( + AnalysisInstanceTypeMaster AnalysisInstanceType = "master" + AnalysisInstanceTypeCoMaster AnalysisInstanceType = "co-master" + AnalysisInstanceTypeIntermediateMaster AnalysisInstanceType = "intermediate-master" +) + +// ReplicationAnalysis notes analysis on replication chain status, per instance +type ReplicationAnalysis struct { + AnalyzedInstanceKey InstanceKey + AnalyzedInstanceMasterKey InstanceKey + ClusterDetails ClusterInfo + AnalyzedInstanceDataCenter string + AnalyzedInstanceRegion string + AnalyzedInstancePhysicalEnvironment string + AnalyzedInstanceBinlogCoordinates BinlogCoordinates + IsMaster bool + IsCoMaster bool + LastCheckValid bool + LastCheckPartialSuccess bool + CountReplicas uint + CountValidReplicas uint + CountValidReplicatingReplicas uint + CountReplicasFailingToConnectToMaster uint + CountDowntimedReplicas uint + ReplicationDepth uint + Replicas InstanceKeyMap + SlaveHosts InstanceKeyMap // for backwards compatibility. Equals `Replicas` + IsFailingToConnectToMaster bool + Analysis AnalysisCode + Description string + StructureAnalysis []StructureAnalysisCode + IsDowntimed bool + IsReplicasDowntimed bool // as good as downtimed because all replicas are downtimed AND analysis is all about the replicas (e.e. AllMasterReplicasNotReplicating) + DowntimeEndTimestamp string + DowntimeRemainingSeconds int + IsBinlogServer bool + PseudoGTIDImmediateTopology bool + OracleGTIDImmediateTopology bool + MariaDBGTIDImmediateTopology bool + BinlogServerImmediateTopology bool + SemiSyncMasterEnabled bool + SemiSyncMasterStatus bool + SemiSyncMasterWaitForReplicaCount uint + SemiSyncMasterClients uint + CountSemiSyncReplicasEnabled uint + CountLoggingReplicas uint + CountStatementBasedLoggingReplicas uint + CountMixedBasedLoggingReplicas uint + CountRowBasedLoggingReplicas uint + CountDistinctMajorVersionsLoggingReplicas uint + CountDelayedReplicas uint + CountLaggingReplicas uint + IsActionableRecovery bool + ProcessingNodeHostname string + ProcessingNodeToken string + CountAdditionalAgreeingNodes int + StartActivePeriod string + SkippableDueToDowntime bool + GTIDMode string + MinReplicaGTIDMode string + MaxReplicaGTIDMode string + MaxReplicaGTIDErrant string + CommandHint string + IsReadOnly bool +} + +type AnalysisMap map[string](*ReplicationAnalysis) + +type ReplicationAnalysisChangelog struct { + AnalyzedInstanceKey InstanceKey + Changelog []string +} + +func (this *ReplicationAnalysis) MarshalJSON() ([]byte, error) { + i := struct { + ReplicationAnalysis + }{} + i.ReplicationAnalysis = *this + // backwards compatibility + i.SlaveHosts = i.Replicas + + return json.Marshal(i) +} + +// ReadReplicaHostsFromString parses and reads replica keys from comma delimited string +func (this *ReplicationAnalysis) ReadReplicaHostsFromString(replicaHostsString string) error { + this.Replicas = *NewInstanceKeyMap() + return this.Replicas.ReadCommaDelimitedList(replicaHostsString) +} + +// AnalysisString returns a human friendly description of all analysis issues +func (this *ReplicationAnalysis) AnalysisString() string { 
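For illustration only, not part of the diff: the custom MarshalJSON above mirrors Replicas into the legacy SlaveHosts field so that older API consumers keep working; a small sketch with assumed replica addresses:

    a := ReplicationAnalysis{Analysis: DeadMaster}
    _ = a.ReadReplicaHostsFromString("replica-1:3306,replica-2:3306") // assumed hosts
    b, _ := json.Marshal(&a)
    // The emitted JSON lists the same two instances under both "Replicas"
    // and the backwards-compatible "SlaveHosts" key.
    fmt.Println(string(b))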
+ result := []string{} + if this.Analysis != NoProblem { + result = append(result, string(this.Analysis)) + } + for _, structureAnalysis := range this.StructureAnalysis { + result = append(result, string(structureAnalysis)) + } + return strings.Join(result, ", ") +} + +// Get a string description of the analyzed instance type (master? co-master? intermediate-master?) +func (this *ReplicationAnalysis) GetAnalysisInstanceType() AnalysisInstanceType { + if this.IsCoMaster { + return AnalysisInstanceTypeCoMaster + } + if this.IsMaster { + return AnalysisInstanceTypeMaster + } + return AnalysisInstanceTypeIntermediateMaster +} + +// ValidSecondsFromSeenToLastAttemptedCheck returns the maximum allowed elapsed time +// between last_attempted_check to last_checked before we consider the instance as invalid. +func ValidSecondsFromSeenToLastAttemptedCheck() uint { + return config.Config.InstancePollSeconds + 1 +} diff --git a/go/vt/orchestrator/inst/analysis_dao.go b/go/vt/orchestrator/inst/analysis_dao.go new file mode 100644 index 0000000000..33fdd8548d --- /dev/null +++ b/go/vt/orchestrator/inst/analysis_dao.go @@ -0,0 +1,875 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + "regexp" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/process" + orcraft "vitess.io/vitess/go/vt/orchestrator/raft" + "vitess.io/vitess/go/vt/orchestrator/util" + + "github.com/patrickmn/go-cache" + "github.com/rcrowley/go-metrics" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +var analysisChangeWriteAttemptCounter = metrics.NewCounter() +var analysisChangeWriteCounter = metrics.NewCounter() + +var recentInstantAnalysis *cache.Cache + +func init() { + metrics.Register("analysis.change.write.attempt", analysisChangeWriteAttemptCounter) + metrics.Register("analysis.change.write", analysisChangeWriteCounter) + + go initializeAnalysisDaoPostConfiguration() +} + +func initializeAnalysisDaoPostConfiguration() { + config.WaitForConfigurationToBeLoaded() + + recentInstantAnalysis = cache.New(time.Duration(config.RecoveryPollSeconds*2)*time.Second, time.Second) +} + +// GetReplicationAnalysis will check for replication problems (dead master; unreachable master; etc) +func GetReplicationAnalysis(clusterName string, hints *ReplicationAnalysisHints) ([]ReplicationAnalysis, error) { + result := []ReplicationAnalysis{} + + args := sqlutils.Args(config.Config.ReasonableReplicationLagSeconds, ValidSecondsFromSeenToLastAttemptedCheck(), config.Config.ReasonableReplicationLagSeconds, clusterName) + analysisQueryReductionClause := `` + + if config.Config.ReduceReplicationAnalysisCount { + analysisQueryReductionClause = ` + HAVING + ( + MIN( + master_instance.last_checked <= master_instance.last_seen + and master_instance.last_attempted_check <= master_instance.last_seen + interval ? 
second + ) = 1 + /* AS is_last_check_valid */ + ) = 0 + OR ( + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.slave_io_running = 0 + AND replica_instance.last_io_error like '%error %connecting to master%' + AND replica_instance.slave_sql_running = 1 + ), + 0 + ) + /* AS count_replicas_failing_to_connect_to_master */ + > 0 + ) + OR ( + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + ), + 0 + ) + /* AS count_valid_replicas */ + < COUNT(replica_instance.server_id) + /* AS count_replicas */ + ) + OR ( + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.slave_io_running != 0 + AND replica_instance.slave_sql_running != 0 + ), + 0 + ) + /* AS count_valid_replicating_replicas */ + < COUNT(replica_instance.server_id) + /* AS count_replicas */ + ) + OR ( + MIN( + master_instance.slave_sql_running = 1 + AND master_instance.slave_io_running = 0 + AND master_instance.last_io_error like '%error %connecting to master%' + ) + /* AS is_failing_to_connect_to_master */ + ) + OR ( + COUNT(replica_instance.server_id) + /* AS count_replicas */ + > 0 + ) + ` + args = append(args, ValidSecondsFromSeenToLastAttemptedCheck()) + } + // "OR count_replicas > 0" above is a recent addition, which, granted, makes some previous conditions redundant. + // It gives more output, and more "NoProblem" messages that I am now interested in for purpose of auditing in database_instance_analysis_changelog + query := fmt.Sprintf(` + SELECT + master_instance.hostname, + master_instance.port, + master_instance.read_only AS read_only, + MIN(master_instance.data_center) AS data_center, + MIN(master_instance.region) AS region, + MIN(master_instance.physical_environment) AS physical_environment, + MIN(master_instance.master_host) AS master_host, + MIN(master_instance.master_port) AS master_port, + MIN(master_instance.cluster_name) AS cluster_name, + MIN(master_instance.binary_log_file) AS binary_log_file, + MIN(master_instance.binary_log_pos) AS binary_log_pos, + MIN( + IFNULL( + master_instance.binary_log_file = database_instance_stale_binlog_coordinates.binary_log_file + AND master_instance.binary_log_pos = database_instance_stale_binlog_coordinates.binary_log_pos + AND database_instance_stale_binlog_coordinates.first_seen < NOW() - interval ? second, + 0 + ) + ) AS is_stale_binlog_coordinates, + MIN( + IFNULL( + cluster_alias.alias, + master_instance.cluster_name + ) + ) AS cluster_alias, + MIN( + IFNULL( + cluster_domain_name.domain_name, + master_instance.cluster_name + ) + ) AS cluster_domain, + MIN( + master_instance.last_checked <= master_instance.last_seen + and master_instance.last_attempted_check <= master_instance.last_seen + interval ? 
second + ) = 1 AS is_last_check_valid, + /* To be considered a master, traditional async replication must not be present/valid AND the host should either */ + /* not be a replication group member OR be the primary of the replication group */ + MIN(master_instance.last_check_partial_success) as last_check_partial_success, + MIN( + ( + master_instance.master_host IN ('', '_') + OR master_instance.master_port = 0 + OR substr(master_instance.master_host, 1, 2) = '//' + ) + AND ( + master_instance.replication_group_name = '' + OR master_instance.replication_group_member_role = 'PRIMARY' + ) + ) AS is_master, + MIN(master_instance.is_co_master) AS is_co_master, + MIN( + CONCAT( + master_instance.hostname, + ':', + master_instance.port + ) = master_instance.cluster_name + ) AS is_cluster_master, + MIN(master_instance.gtid_mode) AS gtid_mode, + COUNT(replica_instance.server_id) AS count_replicas, + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + ), + 0 + ) AS count_valid_replicas, + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.slave_io_running != 0 + AND replica_instance.slave_sql_running != 0 + ), + 0 + ) AS count_valid_replicating_replicas, + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.slave_io_running = 0 + AND replica_instance.last_io_error like '%%error %%connecting to master%%' + AND replica_instance.slave_sql_running = 1 + ), + 0 + ) AS count_replicas_failing_to_connect_to_master, + MIN(master_instance.replication_depth) AS replication_depth, + GROUP_CONCAT( + concat( + replica_instance.Hostname, + ':', + replica_instance.Port + ) + ) as slave_hosts, + MIN( + master_instance.slave_sql_running = 1 + AND master_instance.slave_io_running = 0 + AND master_instance.last_io_error like '%%error %%connecting to master%%' + ) AS is_failing_to_connect_to_master, + MIN( + master_downtime.downtime_active is not null + and ifnull(master_downtime.end_timestamp, now()) > now() + ) AS is_downtimed, + MIN( + IFNULL(master_downtime.end_timestamp, '') + ) AS downtime_end_timestamp, + MIN( + IFNULL( + unix_timestamp() - unix_timestamp(master_downtime.end_timestamp), + 0 + ) + ) AS downtime_remaining_seconds, + MIN( + master_instance.binlog_server + ) AS is_binlog_server, + MIN(master_instance.pseudo_gtid) AS is_pseudo_gtid, + MIN( + master_instance.supports_oracle_gtid + ) AS supports_oracle_gtid, + MIN( + master_instance.semi_sync_master_enabled + ) AS semi_sync_master_enabled, + MIN( + master_instance.semi_sync_master_wait_for_slave_count + ) AS semi_sync_master_wait_for_slave_count, + MIN( + master_instance.semi_sync_master_clients + ) AS semi_sync_master_clients, + MIN( + master_instance.semi_sync_master_status + ) AS semi_sync_master_status, + SUM(replica_instance.is_co_master) AS count_co_master_replicas, + SUM(replica_instance.oracle_gtid) AS count_oracle_gtid_replicas, + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.oracle_gtid != 0 + ), + 0 + ) AS count_valid_oracle_gtid_replicas, + SUM( + replica_instance.binlog_server + ) AS count_binlog_server_replicas, + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.binlog_server != 0 + ), + 0 + ) AS count_valid_binlog_server_replicas, + SUM( + replica_instance.semi_sync_replica_enabled + ) AS count_semi_sync_replicas, + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + AND 
replica_instance.semi_sync_replica_enabled != 0 + ), + 0 + ) AS count_valid_semi_sync_replicas, + MIN( + master_instance.mariadb_gtid + ) AS is_mariadb_gtid, + SUM(replica_instance.mariadb_gtid) AS count_mariadb_gtid_replicas, + IFNULL( + SUM( + replica_instance.last_checked <= replica_instance.last_seen + AND replica_instance.mariadb_gtid != 0 + ), + 0 + ) AS count_valid_mariadb_gtid_replicas, + IFNULL( + SUM( + replica_instance.log_bin + AND replica_instance.log_slave_updates + ), + 0 + ) AS count_logging_replicas, + IFNULL( + SUM( + replica_instance.log_bin + AND replica_instance.log_slave_updates + AND replica_instance.binlog_format = 'STATEMENT' + ), + 0 + ) AS count_statement_based_logging_replicas, + IFNULL( + SUM( + replica_instance.log_bin + AND replica_instance.log_slave_updates + AND replica_instance.binlog_format = 'MIXED' + ), + 0 + ) AS count_mixed_based_logging_replicas, + IFNULL( + SUM( + replica_instance.log_bin + AND replica_instance.log_slave_updates + AND replica_instance.binlog_format = 'ROW' + ), + 0 + ) AS count_row_based_logging_replicas, + IFNULL( + SUM(replica_instance.sql_delay > 0), + 0 + ) AS count_delayed_replicas, + IFNULL( + SUM(replica_instance.slave_lag_seconds > ?), + 0 + ) AS count_lagging_replicas, + IFNULL(MIN(replica_instance.gtid_mode), '') AS min_replica_gtid_mode, + IFNULL(MAX(replica_instance.gtid_mode), '') AS max_replica_gtid_mode, + IFNULL( + MAX( + case when replica_downtime.downtime_active is not null + and ifnull(replica_downtime.end_timestamp, now()) > now() then '' else replica_instance.gtid_errant end + ), + '' + ) AS max_replica_gtid_errant, + IFNULL( + SUM( + replica_downtime.downtime_active is not null + and ifnull(replica_downtime.end_timestamp, now()) > now() + ), + 0 + ) AS count_downtimed_replicas, + COUNT( + DISTINCT case when replica_instance.log_bin + AND replica_instance.log_slave_updates then replica_instance.major_version else NULL end + ) AS count_distinct_logging_major_versions + FROM + database_instance master_instance + LEFT JOIN hostname_resolve ON ( + master_instance.hostname = hostname_resolve.hostname + ) + LEFT JOIN database_instance replica_instance ON ( + COALESCE( + hostname_resolve.resolved_hostname, + master_instance.hostname + ) = replica_instance.master_host + AND master_instance.port = replica_instance.master_port + ) + LEFT JOIN database_instance_maintenance ON ( + master_instance.hostname = database_instance_maintenance.hostname + AND master_instance.port = database_instance_maintenance.port + AND database_instance_maintenance.maintenance_active = 1 + ) + LEFT JOIN database_instance_stale_binlog_coordinates ON ( + master_instance.hostname = database_instance_stale_binlog_coordinates.hostname + AND master_instance.port = database_instance_stale_binlog_coordinates.port + ) + LEFT JOIN database_instance_downtime as master_downtime ON ( + master_instance.hostname = master_downtime.hostname + AND master_instance.port = master_downtime.port + AND master_downtime.downtime_active = 1 + ) + LEFT JOIN database_instance_downtime as replica_downtime ON ( + replica_instance.hostname = replica_downtime.hostname + AND replica_instance.port = replica_downtime.port + AND replica_downtime.downtime_active = 1 + ) + LEFT JOIN cluster_alias ON ( + cluster_alias.cluster_name = master_instance.cluster_name + ) + LEFT JOIN cluster_domain_name ON ( + cluster_domain_name.cluster_name = master_instance.cluster_name + ) + WHERE + database_instance_maintenance.database_instance_maintenance_id IS NULL + AND ? 
IN ('', master_instance.cluster_name) + GROUP BY + master_instance.hostname, + master_instance.port + %s + ORDER BY + is_master DESC, + is_cluster_master DESC, + count_replicas DESC + `, + analysisQueryReductionClause) + + err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + a := ReplicationAnalysis{ + Analysis: NoProblem, + ProcessingNodeHostname: process.ThisHostname, + ProcessingNodeToken: util.ProcessToken.Hash, + } + + a.IsMaster = m.GetBool("is_master") + countCoMasterReplicas := m.GetUint("count_co_master_replicas") + a.IsCoMaster = m.GetBool("is_co_master") || (countCoMasterReplicas > 0) + a.AnalyzedInstanceKey = InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")} + a.AnalyzedInstanceMasterKey = InstanceKey{Hostname: m.GetString("master_host"), Port: m.GetInt("master_port")} + a.AnalyzedInstanceDataCenter = m.GetString("data_center") + a.AnalyzedInstanceRegion = m.GetString("region") + a.AnalyzedInstancePhysicalEnvironment = m.GetString("physical_environment") + a.AnalyzedInstanceBinlogCoordinates = BinlogCoordinates{ + LogFile: m.GetString("binary_log_file"), + LogPos: m.GetInt64("binary_log_pos"), + Type: BinaryLog, + } + isStaleBinlogCoordinates := m.GetBool("is_stale_binlog_coordinates") + a.ClusterDetails.ClusterName = m.GetString("cluster_name") + a.ClusterDetails.ClusterAlias = m.GetString("cluster_alias") + a.ClusterDetails.ClusterDomain = m.GetString("cluster_domain") + a.GTIDMode = m.GetString("gtid_mode") + a.LastCheckValid = m.GetBool("is_last_check_valid") + a.LastCheckPartialSuccess = m.GetBool("last_check_partial_success") + a.CountReplicas = m.GetUint("count_replicas") + a.CountValidReplicas = m.GetUint("count_valid_replicas") + a.CountValidReplicatingReplicas = m.GetUint("count_valid_replicating_replicas") + a.CountReplicasFailingToConnectToMaster = m.GetUint("count_replicas_failing_to_connect_to_master") + a.CountDowntimedReplicas = m.GetUint("count_downtimed_replicas") + a.ReplicationDepth = m.GetUint("replication_depth") + a.IsFailingToConnectToMaster = m.GetBool("is_failing_to_connect_to_master") + a.IsDowntimed = m.GetBool("is_downtimed") + a.DowntimeEndTimestamp = m.GetString("downtime_end_timestamp") + a.DowntimeRemainingSeconds = m.GetInt("downtime_remaining_seconds") + a.IsBinlogServer = m.GetBool("is_binlog_server") + a.ClusterDetails.ReadRecoveryInfo() + + a.Replicas = *NewInstanceKeyMap() + a.Replicas.ReadCommaDelimitedList(m.GetString("slave_hosts")) + + countValidOracleGTIDReplicas := m.GetUint("count_valid_oracle_gtid_replicas") + a.OracleGTIDImmediateTopology = countValidOracleGTIDReplicas == a.CountValidReplicas && a.CountValidReplicas > 0 + countValidMariaDBGTIDReplicas := m.GetUint("count_valid_mariadb_gtid_replicas") + a.MariaDBGTIDImmediateTopology = countValidMariaDBGTIDReplicas == a.CountValidReplicas && a.CountValidReplicas > 0 + countValidBinlogServerReplicas := m.GetUint("count_valid_binlog_server_replicas") + a.BinlogServerImmediateTopology = countValidBinlogServerReplicas == a.CountValidReplicas && a.CountValidReplicas > 0 + a.PseudoGTIDImmediateTopology = m.GetBool("is_pseudo_gtid") + a.SemiSyncMasterEnabled = m.GetBool("semi_sync_master_enabled") + a.SemiSyncMasterStatus = m.GetBool("semi_sync_master_status") + a.CountSemiSyncReplicasEnabled = m.GetUint("count_semi_sync_replicas") + // countValidSemiSyncReplicasEnabled := m.GetUint("count_valid_semi_sync_replicas") + a.SemiSyncMasterWaitForReplicaCount = m.GetUint("semi_sync_master_wait_for_slave_count") + a.SemiSyncMasterClients = 
m.GetUint("semi_sync_master_clients") + + a.MinReplicaGTIDMode = m.GetString("min_replica_gtid_mode") + a.MaxReplicaGTIDMode = m.GetString("max_replica_gtid_mode") + a.MaxReplicaGTIDErrant = m.GetString("max_replica_gtid_errant") + + a.CountLoggingReplicas = m.GetUint("count_logging_replicas") + a.CountStatementBasedLoggingReplicas = m.GetUint("count_statement_based_logging_replicas") + a.CountMixedBasedLoggingReplicas = m.GetUint("count_mixed_based_logging_replicas") + a.CountRowBasedLoggingReplicas = m.GetUint("count_row_based_logging_replicas") + a.CountDistinctMajorVersionsLoggingReplicas = m.GetUint("count_distinct_logging_major_versions") + + a.CountDelayedReplicas = m.GetUint("count_delayed_replicas") + a.CountLaggingReplicas = m.GetUint("count_lagging_replicas") + + a.IsReadOnly = m.GetUint("read_only") == 1 + + if !a.LastCheckValid { + analysisMessage := fmt.Sprintf("analysis: ClusterName: %+v, IsMaster: %+v, LastCheckValid: %+v, LastCheckPartialSuccess: %+v, CountReplicas: %+v, CountValidReplicas: %+v, CountValidReplicatingReplicas: %+v, CountLaggingReplicas: %+v, CountDelayedReplicas: %+v, CountReplicasFailingToConnectToMaster: %+v", + a.ClusterDetails.ClusterName, a.IsMaster, a.LastCheckValid, a.LastCheckPartialSuccess, a.CountReplicas, a.CountValidReplicas, a.CountValidReplicatingReplicas, a.CountLaggingReplicas, a.CountDelayedReplicas, a.CountReplicasFailingToConnectToMaster, + ) + if util.ClearToLog("analysis_dao", analysisMessage) { + log.Debugf(analysisMessage) + } + } + if a.IsMaster && !a.LastCheckValid && a.CountReplicas == 0 { + a.Analysis = DeadMasterWithoutReplicas + a.Description = "Master cannot be reached by orchestrator and has no replica" + // + } else if a.IsMaster && !a.LastCheckValid && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadMaster + a.Description = "Master cannot be reached by orchestrator and none of its replicas is replicating" + // + } else if a.IsMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadMasterAndReplicas + a.Description = "Master cannot be reached by orchestrator and none of its replicas is replicating" + // + } else if a.IsMaster && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadMasterAndSomeReplicas + a.Description = "Master cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" + // + } else if a.IsMaster && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 { + a.Analysis = UnreachableMasterWithLaggingReplicas + a.Description = "Master cannot be reached by orchestrator and all of its replicas are lagging" + // + } else if a.IsMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { + // partial success is here to redice noise + a.Analysis = UnreachableMaster + a.Description = "Master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" + // + } else if a.IsMaster && !a.LastCheckValid && a.LastCheckPartialSuccess && a.CountReplicasFailingToConnectToMaster > 0 && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { + // there's partial success, but also at least one replica is failing to connect to master + 
a.Analysis = UnreachableMaster + a.Description = "Master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" + // + } else if a.IsMaster && a.SemiSyncMasterEnabled && a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients < a.SemiSyncMasterWaitForReplicaCount { + if isStaleBinlogCoordinates { + a.Analysis = LockedSemiSyncMaster + a.Description = "Semi sync master is locked since it doesn't get enough replica acknowledgements" + } else { + a.Analysis = LockedSemiSyncMasterHypothesis + a.Description = "Semi sync master seems to be locked, more samplings needed to validate" + } + // + } else if a.IsMaster && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = MasterSingleReplicaNotReplicating + a.Description = "Master is reachable but its single replica is not replicating" + // + } else if a.IsMaster && a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 0 { + a.Analysis = MasterSingleReplicaDead + a.Description = "Master is reachable but its single replica is dead" + // + } else if a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = AllMasterReplicasNotReplicating + a.Description = "Master is reachable but none of its replicas is replicating" + // + } else if a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = AllMasterReplicasNotReplicatingOrDead + a.Description = "Master is reachable but none of its replicas is replicating" + // + } else /* co-master */ if a.IsCoMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadCoMaster + a.Description = "Co-master cannot be reached by orchestrator and none of its replicas is replicating" + // + } else if a.IsCoMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadCoMasterAndSomeReplicas + a.Description = "Co-master cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" + // + } else if a.IsCoMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { + a.Analysis = UnreachableCoMaster + a.Description = "Co-master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" + // + } else if a.IsCoMaster && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = AllCoMasterReplicasNotReplicating + a.Description = "Co-master is reachable but none of its replicas is replicating" + // + } else /* intermediate-master */ if !a.IsMaster && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == a.CountReplicas && a.CountReplicasFailingToConnectToMaster == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadIntermediateMasterWithSingleReplicaFailingToConnect + a.Description = "Intermediate master cannot be reached by orchestrator and its (single) replica is failing to connect" + // + } else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas == 1 && a.CountValidReplicas == 
a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadIntermediateMasterWithSingleReplica + a.Description = "Intermediate master cannot be reached by orchestrator and its (single) replica is not replicating" + // + } else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicas == a.CountReplicas && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadIntermediateMaster + a.Description = "Intermediate master cannot be reached by orchestrator and none of its replicas is replicating" + // + } else if !a.IsMaster && !a.LastCheckValid && a.CountValidReplicas < a.CountReplicas && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = DeadIntermediateMasterAndSomeReplicas + a.Description = "Intermediate master cannot be reached by orchestrator; some of its replicas are unreachable and none of its reachable replicas is replicating" + // + } else if !a.IsMaster && !a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicas == 0 { + a.Analysis = DeadIntermediateMasterAndReplicas + a.Description = "Intermediate master cannot be reached by orchestrator and all of its replicas are unreachable" + // + } else if !a.IsMaster && !a.LastCheckValid && a.CountLaggingReplicas == a.CountReplicas && a.CountDelayedReplicas < a.CountReplicas && a.CountValidReplicatingReplicas > 0 { + a.Analysis = UnreachableIntermediateMasterWithLaggingReplicas + a.Description = "Intermediate master cannot be reached by orchestrator and all of its replicas are lagging" + // + } else if !a.IsMaster && !a.LastCheckValid && !a.LastCheckPartialSuccess && a.CountValidReplicas > 0 && a.CountValidReplicatingReplicas > 0 { + a.Analysis = UnreachableIntermediateMaster + a.Description = "Intermediate master cannot be reached by orchestrator but it has replicating replicas; possibly a network/host issue" + // + } else if !a.IsMaster && a.LastCheckValid && a.CountReplicas > 1 && a.CountValidReplicatingReplicas == 0 && + a.CountReplicasFailingToConnectToMaster > 0 && a.CountReplicasFailingToConnectToMaster == a.CountValidReplicas { + // All replicas are either failing to connect to master (and at least one of these have to exist) + // or completely dead. + // Must have at least two replicas to reach such conclusion -- do note that the intermediate master is still + // reachable to orchestrator, so we base our conclusion on replicas only at this point. 
+ a.Analysis = AllIntermediateMasterReplicasFailingToConnectOrDead + a.Description = "Intermediate master is reachable but all of its replicas are failing to connect" + // + } else if !a.IsMaster && a.LastCheckValid && a.CountReplicas > 0 && a.CountValidReplicatingReplicas == 0 { + a.Analysis = AllIntermediateMasterReplicasNotReplicating + a.Description = "Intermediate master is reachable but none of its replicas is replicating" + // + } else if a.IsBinlogServer && a.IsFailingToConnectToMaster { + a.Analysis = BinlogServerFailingToConnectToMaster + a.Description = "Binlog server is unable to connect to its master" + // + } else if a.ReplicationDepth == 1 && a.IsFailingToConnectToMaster { + a.Analysis = FirstTierReplicaFailingToConnectToMaster + a.Description = "1st tier replica (directly replicating from topology master) is unable to connect to the master" + // + } + // else if a.IsMaster && a.CountReplicas == 0 { + // a.Analysis = MasterWithoutReplicas + // a.Description = "Master has no replicas" + // } + + appendAnalysis := func(analysis *ReplicationAnalysis) { + if a.Analysis == NoProblem && len(a.StructureAnalysis) == 0 && !hints.IncludeNoProblem { + return + } + for _, filter := range config.Config.RecoveryIgnoreHostnameFilters { + if matched, _ := regexp.MatchString(filter, a.AnalyzedInstanceKey.Hostname); matched { + return + } + } + if a.IsDowntimed { + a.SkippableDueToDowntime = true + } + if a.CountReplicas == a.CountDowntimedReplicas { + switch a.Analysis { + case AllMasterReplicasNotReplicating, + AllMasterReplicasNotReplicatingOrDead, + MasterSingleReplicaDead, + AllCoMasterReplicasNotReplicating, + DeadIntermediateMasterWithSingleReplica, + DeadIntermediateMasterWithSingleReplicaFailingToConnect, + DeadIntermediateMasterAndReplicas, + DeadIntermediateMasterAndSomeReplicas, + AllIntermediateMasterReplicasFailingToConnectOrDead, + AllIntermediateMasterReplicasNotReplicating: + a.IsReplicasDowntimed = true + a.SkippableDueToDowntime = true + } + } + if a.SkippableDueToDowntime && !hints.IncludeDowntimed { + return + } + result = append(result, a) + } + + { + // Moving on to structure analysis + // We also do structural checks. 
See if there's potential danger in promotions + if a.IsMaster && a.CountLoggingReplicas == 0 && a.CountReplicas > 1 { + a.StructureAnalysis = append(a.StructureAnalysis, NoLoggingReplicasStructureWarning) + } + if a.IsMaster && a.CountReplicas > 1 && + !a.OracleGTIDImmediateTopology && + !a.MariaDBGTIDImmediateTopology && + !a.BinlogServerImmediateTopology && + !a.PseudoGTIDImmediateTopology { + a.StructureAnalysis = append(a.StructureAnalysis, NoFailoverSupportStructureWarning) + } + if a.IsMaster && a.CountStatementBasedLoggingReplicas > 0 && a.CountMixedBasedLoggingReplicas > 0 { + a.StructureAnalysis = append(a.StructureAnalysis, StatementAndMixedLoggingReplicasStructureWarning) + } + if a.IsMaster && a.CountStatementBasedLoggingReplicas > 0 && a.CountRowBasedLoggingReplicas > 0 { + a.StructureAnalysis = append(a.StructureAnalysis, StatementAndRowLoggingReplicasStructureWarning) + } + if a.IsMaster && a.CountMixedBasedLoggingReplicas > 0 && a.CountRowBasedLoggingReplicas > 0 { + a.StructureAnalysis = append(a.StructureAnalysis, MixedAndRowLoggingReplicasStructureWarning) + } + if a.IsMaster && a.CountDistinctMajorVersionsLoggingReplicas > 1 { + a.StructureAnalysis = append(a.StructureAnalysis, MultipleMajorVersionsLoggingReplicasStructureWarning) + } + + if a.CountReplicas > 0 && (a.GTIDMode != a.MinReplicaGTIDMode || a.GTIDMode != a.MaxReplicaGTIDMode) { + a.StructureAnalysis = append(a.StructureAnalysis, DifferentGTIDModesStructureWarning) + } + if a.MaxReplicaGTIDErrant != "" { + a.StructureAnalysis = append(a.StructureAnalysis, ErrantGTIDStructureWarning) + } + + if a.IsMaster && a.IsReadOnly { + a.StructureAnalysis = append(a.StructureAnalysis, NoWriteableMasterStructureWarning) + } + + if a.IsMaster && a.SemiSyncMasterEnabled && !a.SemiSyncMasterStatus && a.SemiSyncMasterWaitForReplicaCount > 0 && a.SemiSyncMasterClients < a.SemiSyncMasterWaitForReplicaCount { + a.StructureAnalysis = append(a.StructureAnalysis, NotEnoughValidSemiSyncReplicasStructureWarning) + } + } + appendAnalysis(&a) + + if a.CountReplicas > 0 && hints.AuditAnalysis { + // Interesting enough for analysis + go auditInstanceAnalysisInChangelog(&a.AnalyzedInstanceKey, a.Analysis) + } + return nil + }) + + if err != nil { + return result, log.Errore(err) + } + // TODO: result, err = getConcensusReplicationAnalysis(result) + return result, log.Errore(err) +} + +func getConcensusReplicationAnalysis(analysisEntries []ReplicationAnalysis) ([]ReplicationAnalysis, error) { + if !orcraft.IsRaftEnabled() { + return analysisEntries, nil + } + if !config.Config.ExpectFailureAnalysisConcensus { + return analysisEntries, nil + } + concensusAnalysisEntries := []ReplicationAnalysis{} + peerAnalysisMap, err := ReadPeerAnalysisMap() + if err != nil { + return analysisEntries, err + } + quorumSize, err := orcraft.QuorumSize() + if err != nil { + return analysisEntries, err + } + + for _, analysisEntry := range analysisEntries { + instanceAnalysis := NewInstanceAnalysis(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + analysisKey := instanceAnalysis.String() + + peerAnalysisCount := peerAnalysisMap[analysisKey] + if 1+peerAnalysisCount >= quorumSize { + // this node and enough other nodes in agreement + concensusAnalysisEntries = append(concensusAnalysisEntries, analysisEntry) + } + } + return concensusAnalysisEntries, nil +} + +// auditInstanceAnalysisInChangelog will write down an instance's analysis in the database_instance_analysis_changelog table. 
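+// It first consults an in-memory cache (recentInstantAnalysis) of the most recently written analysis per instance, and skips the write entirely when the analysis has not changed.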
+// To not repeat recurring analysis code, the database_instance_last_analysis table is used, so that only changes to +// analysis codes are written. +func auditInstanceAnalysisInChangelog(instanceKey *InstanceKey, analysisCode AnalysisCode) error { + if lastWrittenAnalysis, found := recentInstantAnalysis.Get(instanceKey.DisplayString()); found { + if lastWrittenAnalysis == analysisCode { + // Surely nothing new. + // And let's expand the timeout + recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration) + return nil + } + } + // Passed the cache; but does database agree that there's a change? Here's a persistent cache; this comes here + // to verify no two orchestrator services are doing this without coordinating (namely, one dies, the other taking its place + // and has no familiarity of the former's cache) + analysisChangeWriteAttemptCounter.Inc(1) + + lastAnalysisChanged := false + { + sqlResult, err := db.ExecOrchestrator(` + update database_instance_last_analysis set + analysis = ?, + analysis_timestamp = now() + where + hostname = ? + and port = ? + and analysis != ? + `, + string(analysisCode), instanceKey.Hostname, instanceKey.Port, string(analysisCode), + ) + if err != nil { + return log.Errore(err) + } + rows, err := sqlResult.RowsAffected() + if err != nil { + return log.Errore(err) + } + lastAnalysisChanged = (rows > 0) + } + if !lastAnalysisChanged { + _, err := db.ExecOrchestrator(` + insert ignore into database_instance_last_analysis ( + hostname, port, analysis_timestamp, analysis + ) values ( + ?, ?, now(), ? + ) + `, + instanceKey.Hostname, instanceKey.Port, string(analysisCode), + ) + if err != nil { + return log.Errore(err) + } + } + recentInstantAnalysis.Set(instanceKey.DisplayString(), analysisCode, cache.DefaultExpiration) + if !lastAnalysisChanged { + return nil + } + + _, err := db.ExecOrchestrator(` + insert into database_instance_analysis_changelog ( + hostname, port, analysis_timestamp, analysis + ) values ( + ?, ?, now(), ? + ) + `, + instanceKey.Hostname, instanceKey.Port, string(analysisCode), + ) + if err == nil { + analysisChangeWriteCounter.Inc(1) + } + return log.Errore(err) +} + +// ExpireInstanceAnalysisChangelog removes old-enough analysis entries from the changelog +func ExpireInstanceAnalysisChangelog() error { + _, err := db.ExecOrchestrator(` + delete + from database_instance_analysis_changelog + where + analysis_timestamp < now() - interval ? 
hour + `, + config.Config.UnseenInstanceForgetHours, + ) + return log.Errore(err) +} + +// ReadReplicationAnalysisChangelog +func ReadReplicationAnalysisChangelog() (res [](*ReplicationAnalysisChangelog), err error) { + query := ` + select + hostname, + port, + analysis_timestamp, + analysis + from + database_instance_analysis_changelog + order by + hostname, port, changelog_id + ` + analysisChangelog := &ReplicationAnalysisChangelog{} + err = db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + key := InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")} + + if !analysisChangelog.AnalyzedInstanceKey.Equals(&key) { + analysisChangelog = &ReplicationAnalysisChangelog{AnalyzedInstanceKey: key, Changelog: []string{}} + res = append(res, analysisChangelog) + } + analysisEntry := fmt.Sprintf("%s;%s,", m.GetString("analysis_timestamp"), m.GetString("analysis")) + analysisChangelog.Changelog = append(analysisChangelog.Changelog, analysisEntry) + + return nil + }) + + if err != nil { + log.Errore(err) + } + return res, err +} + +// ReadPeerAnalysisMap reads raft-peer failure analysis, and returns a PeerAnalysisMap, +// indicating how many peers see which analysis +func ReadPeerAnalysisMap() (peerAnalysisMap PeerAnalysisMap, err error) { + peerAnalysisMap = make(PeerAnalysisMap) + query := ` + select + hostname, + port, + analysis + from + database_instance_peer_analysis + order by + peer, hostname, port + ` + err = db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + instanceKey := InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")} + analysis := m.GetString("analysis") + instanceAnalysis := NewInstanceAnalysis(&instanceKey, AnalysisCode(analysis)) + mapKey := instanceAnalysis.String() + peerAnalysisMap[mapKey] = peerAnalysisMap[mapKey] + 1 + + return nil + }) + return peerAnalysisMap, log.Errore(err) +} diff --git a/go/vt/orchestrator/inst/analysis_test.go b/go/vt/orchestrator/inst/analysis_test.go new file mode 100644 index 0000000000..76d0dfd400 --- /dev/null +++ b/go/vt/orchestrator/inst/analysis_test.go @@ -0,0 +1,50 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "testing" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func init() { + config.Config.HostnameResolveMethod = "none" + config.MarkConfigurationLoaded() + log.SetLevel(log.ERROR) +} + +func TestGetAnalysisInstanceType(t *testing.T) { + { + analysis := &ReplicationAnalysis{} + test.S(t).ExpectEquals(string(analysis.GetAnalysisInstanceType()), "intermediate-master") + } + { + analysis := &ReplicationAnalysis{IsMaster: true} + test.S(t).ExpectEquals(string(analysis.GetAnalysisInstanceType()), "master") + } + { + analysis := &ReplicationAnalysis{IsCoMaster: true} + test.S(t).ExpectEquals(string(analysis.GetAnalysisInstanceType()), "co-master") + } + { + analysis := &ReplicationAnalysis{IsMaster: true, IsCoMaster: true} + test.S(t).ExpectEquals(string(analysis.GetAnalysisInstanceType()), "co-master") + } +} diff --git a/go/vt/orchestrator/inst/audit.go b/go/vt/orchestrator/inst/audit.go new file mode 100644 index 0000000000..4b87556bb6 --- /dev/null +++ b/go/vt/orchestrator/inst/audit.go @@ -0,0 +1,26 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +// Audit presents a single audit entry (namely in the database) +type Audit struct { + AuditId int64 + AuditTimestamp string + AuditType string + AuditInstanceKey InstanceKey + Message string +} diff --git a/go/vt/orchestrator/inst/audit_dao.go b/go/vt/orchestrator/inst/audit_dao.go new file mode 100644 index 0000000000..c063c3399a --- /dev/null +++ b/go/vt/orchestrator/inst/audit_dao.go @@ -0,0 +1,160 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + "log/syslog" + "os" + "time" + + "github.com/rcrowley/go-metrics" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +// syslogWriter is optional, and defaults to nil (disabled) +var syslogWriter *syslog.Writer + +var auditOperationCounter = metrics.NewCounter() + +func init() { + metrics.Register("audit.write", auditOperationCounter) +} + +// EnableSyslogWriter enables, if possible, writes to syslog. 
These will execute _in addition_ to normal logging +func EnableAuditSyslog() (err error) { + syslogWriter, err = syslog.New(syslog.LOG_ERR, "orchestrator") + if err != nil { + syslogWriter = nil + } + return err +} + +// AuditOperation creates and writes a new audit entry by given params +func AuditOperation(auditType string, instanceKey *InstanceKey, message string) error { + if instanceKey == nil { + instanceKey = &InstanceKey{} + } + clusterName := "" + if instanceKey.Hostname != "" { + clusterName, _ = GetClusterName(instanceKey) + } + + auditWrittenToFile := false + if config.Config.AuditLogFile != "" { + auditWrittenToFile = true + go func() error { + f, err := os.OpenFile(config.Config.AuditLogFile, os.O_RDWR|os.O_CREATE|os.O_APPEND, 0640) + if err != nil { + return log.Errore(err) + } + + defer f.Close() + text := fmt.Sprintf("%s\t%s\t%s\t%d\t[%s]\t%s\t\n", time.Now().Format(log.TimeFormat), auditType, instanceKey.Hostname, instanceKey.Port, clusterName, message) + if _, err = f.WriteString(text); err != nil { + return log.Errore(err) + } + return nil + }() + } + if config.Config.AuditToBackendDB { + _, err := db.ExecOrchestrator(` + insert + into audit ( + audit_timestamp, audit_type, hostname, port, cluster_name, message + ) VALUES ( + NOW(), ?, ?, ?, ?, ? + ) + `, + auditType, + instanceKey.Hostname, + instanceKey.Port, + clusterName, + message, + ) + if err != nil { + return log.Errore(err) + } + } + logMessage := fmt.Sprintf("auditType:%s instance:%s cluster:%s message:%s", auditType, instanceKey.DisplayString(), clusterName, message) + if syslogWriter != nil { + auditWrittenToFile = true + go func() { + syslogWriter.Info(logMessage) + }() + } + if !auditWrittenToFile { + log.Infof(logMessage) + } + auditOperationCounter.Inc(1) + + return nil +} + +// ReadRecentAudit returns a list of audit entries order chronologically descending, using page number. +func ReadRecentAudit(instanceKey *InstanceKey, page int) ([]Audit, error) { + res := []Audit{} + args := sqlutils.Args() + whereCondition := `` + if instanceKey != nil { + whereCondition = `where hostname=? and port=?` + args = append(args, instanceKey.Hostname, instanceKey.Port) + } + query := fmt.Sprintf(` + select + audit_id, + audit_timestamp, + audit_type, + hostname, + port, + message + from + audit + %s + order by + audit_timestamp desc + limit ? + offset ? + `, whereCondition) + args = append(args, config.AuditPageSize, page*config.AuditPageSize) + err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + audit := Audit{} + audit.AuditId = m.GetInt64("audit_id") + audit.AuditTimestamp = m.GetString("audit_timestamp") + audit.AuditType = m.GetString("audit_type") + audit.AuditInstanceKey.Hostname = m.GetString("hostname") + audit.AuditInstanceKey.Port = m.GetInt("port") + audit.Message = m.GetString("message") + + res = append(res, audit) + return nil + }) + + if err != nil { + log.Errore(err) + } + return res, err + +} + +// ExpireAudit removes old rows from the audit table +func ExpireAudit() error { + return ExpireTableData("audit", "audit_timestamp") +} diff --git a/go/vt/orchestrator/inst/binlog.go b/go/vt/orchestrator/inst/binlog.go new file mode 100644 index 0000000000..8e6887b012 --- /dev/null +++ b/go/vt/orchestrator/inst/binlog.go @@ -0,0 +1,196 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "errors" + "fmt" + "regexp" + "strconv" + "strings" +) + +var detachPattern *regexp.Regexp + +func init() { + detachPattern, _ = regexp.Compile(`//([^/:]+):([\d]+)`) // e.g. `//binlog.01234:567890` +} + +type BinlogType int + +const ( + BinaryLog BinlogType = iota + RelayLog +) + +// BinlogCoordinates describes binary log coordinates in the form of log file & log position. +type BinlogCoordinates struct { + LogFile string + LogPos int64 + Type BinlogType +} + +// rpad formats the binlog coordinates to a given size. If the size +// increases this value is modified so it can be reused later. This +// is to ensure consistent formatting in debug output. +func rpad(coordinates BinlogCoordinates, length *int) string { + s := fmt.Sprintf("%+v", coordinates) + if len(s) > *length { + *length = len(s) + } + + if len(s) >= *length { + return s + } + return fmt.Sprintf("%s%s", s, strings.Repeat(" ", *length-len(s))) +} + +// ParseBinlogCoordinates parses BinlogCoordinates from a string representation such as mysql-bin.000123:456789 +func ParseBinlogCoordinates(logFileLogPos string) (*BinlogCoordinates, error) { + tokens := strings.SplitN(logFileLogPos, ":", 2) + if len(tokens) != 2 { + return nil, fmt.Errorf("ParseBinlogCoordinates: Cannot parse BinlogCoordinates from %s. Expected format is file:pos", logFileLogPos) + } + + if logPos, err := strconv.ParseInt(tokens[1], 10, 0); err != nil { + return nil, fmt.Errorf("ParseBinlogCoordinates: invalid pos: %s", tokens[1]) + } else { + return &BinlogCoordinates{LogFile: tokens[0], LogPos: logPos}, nil + } +} + +// DisplayString returns a user-friendly string representation of these coordinates +func (this *BinlogCoordinates) DisplayString() string { + return fmt.Sprintf("%s:%d", this.LogFile, this.LogPos) +} + +// String returns a user-friendly string representation of these coordinates +func (this BinlogCoordinates) String() string { + return this.DisplayString() +} + +// Equals tests equality of this coordinate and another one. +func (this *BinlogCoordinates) Equals(other *BinlogCoordinates) bool { + if other == nil { + return false + } + return this.LogFile == other.LogFile && this.LogPos == other.LogPos && this.Type == other.Type +} + +// IsEmpty returns true if the log file is empty, unnamed +func (this *BinlogCoordinates) IsEmpty() bool { + return this.LogFile == "" +} + +// SmallerThan returns true if this coordinate is strictly smaller than the other. +func (this *BinlogCoordinates) SmallerThan(other *BinlogCoordinates) bool { + if this.LogFile < other.LogFile { + return true + } + if this.LogFile == other.LogFile && this.LogPos < other.LogPos { + return true + } + return false +} + +// SmallerThanOrEquals returns true if this coordinate is smaller than or equal to the other one.
+// We do NOT compare the type so we can not use this.Equals() +func (this *BinlogCoordinates) SmallerThanOrEquals(other *BinlogCoordinates) bool { + if this.SmallerThan(other) { + return true + } + return this.LogFile == other.LogFile && this.LogPos == other.LogPos // No Type comparison +} + +// FileSmallerThan returns true if this coordinate's file is strictly smaller than the other's. +func (this *BinlogCoordinates) FileSmallerThan(other *BinlogCoordinates) bool { + return this.LogFile < other.LogFile +} + +// FileNumberDistance returns the numeric distance between this coordinate's file number and the other's. +// Effectively it means "how many rotates/FLUSHes would make these coordinates' file reach the other's" +func (this *BinlogCoordinates) FileNumberDistance(other *BinlogCoordinates) int { + thisNumber, _ := this.FileNumber() + otherNumber, _ := other.FileNumber() + return otherNumber - thisNumber +} + +// FileNumber returns the numeric value of the file, and the length in characters representing the number in the filename. +// Example: FileNumber() of mysqld.log.000789 is (789, 6) +func (this *BinlogCoordinates) FileNumber() (int, int) { + tokens := strings.Split(this.LogFile, ".") + numPart := tokens[len(tokens)-1] + numLen := len(numPart) + fileNum, err := strconv.Atoi(numPart) + if err != nil { + return 0, 0 + } + return fileNum, numLen +} + +// PreviousFileCoordinatesBy guesses the filename of the previous binlog/relaylog, by given offset (number of files back) +func (this *BinlogCoordinates) PreviousFileCoordinatesBy(offset int) (BinlogCoordinates, error) { + result := BinlogCoordinates{LogPos: 0, Type: this.Type} + + fileNum, numLen := this.FileNumber() + if fileNum == 0 { + return result, errors.New("Log file number is zero, cannot detect previous file") + } + newNumStr := fmt.Sprintf("%d", (fileNum - offset)) + newNumStr = strings.Repeat("0", numLen-len(newNumStr)) + newNumStr + + tokens := strings.Split(this.LogFile, ".") + tokens[len(tokens)-1] = newNumStr + result.LogFile = strings.Join(tokens, ".") + return result, nil +} + +// PreviousFileCoordinates guesses the filename of the previous binlog/relaylog +func (this *BinlogCoordinates) PreviousFileCoordinates() (BinlogCoordinates, error) { + return this.PreviousFileCoordinatesBy(1) +} + +// NextFileCoordinates guesses the filename of the next binlog/relaylog +func (this *BinlogCoordinates) NextFileCoordinates() (BinlogCoordinates, error) { + result := BinlogCoordinates{LogPos: 0, Type: this.Type} + + fileNum, numLen := this.FileNumber() + newNumStr := fmt.Sprintf("%d", (fileNum + 1)) + newNumStr = strings.Repeat("0", numLen-len(newNumStr)) + newNumStr + + tokens := strings.Split(this.LogFile, ".") + tokens[len(tokens)-1] = newNumStr + result.LogFile = strings.Join(tokens, ".") + return result, nil +} + +// Detach returns a detached form of coordinates +func (this *BinlogCoordinates) Detach() (detachedCoordinates BinlogCoordinates) { + detachedCoordinates = BinlogCoordinates{LogFile: fmt.Sprintf("//%s:%d", this.LogFile, this.LogPos), LogPos: this.LogPos} + return detachedCoordinates +} + +// ExtractDetachedCoordinates checks whether these coordinates are in detached form and, if so, returns the original coordinates they encode.
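+// For example (per the Detach() format above, and TestDetach/TestDetachedCoordinates2 below), a detached LogFile of `//mysql-bin.000010:108` extracts back to LogFile mysql-bin.000010 at LogPos 108.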
+func (this *BinlogCoordinates) ExtractDetachedCoordinates() (isDetached bool, detachedCoordinates BinlogCoordinates) { + detachedCoordinatesSubmatch := detachPattern.FindStringSubmatch(this.LogFile) + if len(detachedCoordinatesSubmatch) == 0 { + return false, *this + } + detachedCoordinates.LogFile = detachedCoordinatesSubmatch[1] + detachedCoordinates.LogPos, _ = strconv.ParseInt(detachedCoordinatesSubmatch[2], 10, 0) + return true, detachedCoordinates +} diff --git a/go/vt/orchestrator/inst/binlog_test.go b/go/vt/orchestrator/inst/binlog_test.go new file mode 100644 index 0000000000..98eb241053 --- /dev/null +++ b/go/vt/orchestrator/inst/binlog_test.go @@ -0,0 +1,138 @@ +package inst + +import ( + "testing" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +var testCoordinates = BinlogCoordinates{LogFile: "mysql-bin.000010", LogPos: 108} + +func init() { + config.Config.HostnameResolveMethod = "none" + config.Config.KVClusterMasterPrefix = "test/master/" + config.MarkConfigurationLoaded() + log.SetLevel(log.ERROR) +} + +func TestDetach(t *testing.T) { + detachedCoordinates := testCoordinates.Detach() + test.S(t).ExpectEquals(detachedCoordinates.LogFile, "//mysql-bin.000010:108") + test.S(t).ExpectEquals(detachedCoordinates.LogPos, testCoordinates.LogPos) +} + +func TestDetachedCoordinates(t *testing.T) { + isDetached, detachedCoordinates := testCoordinates.ExtractDetachedCoordinates() + test.S(t).ExpectFalse(isDetached) + test.S(t).ExpectEquals(detachedCoordinates.LogFile, testCoordinates.LogFile) + test.S(t).ExpectEquals(detachedCoordinates.LogPos, testCoordinates.LogPos) +} + +func TestDetachedCoordinates2(t *testing.T) { + detached := testCoordinates.Detach() + isDetached, coordinates := detached.ExtractDetachedCoordinates() + + test.S(t).ExpectTrue(isDetached) + test.S(t).ExpectEquals(coordinates.LogFile, testCoordinates.LogFile) + test.S(t).ExpectEquals(coordinates.LogPos, testCoordinates.LogPos) +} + +func TestPreviousFileCoordinates(t *testing.T) { + previous, err := testCoordinates.PreviousFileCoordinates() + + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(previous.LogFile, "mysql-bin.000009") + test.S(t).ExpectEquals(previous.LogPos, int64(0)) +} + +func TestNextFileCoordinates(t *testing.T) { + next, err := testCoordinates.NextFileCoordinates() + + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(next.LogFile, "mysql-bin.000011") + test.S(t).ExpectEquals(next.LogPos, int64(0)) +} + +func TestBinlogCoordinates(t *testing.T) { + c1 := BinlogCoordinates{LogFile: "mysql-bin.00017", LogPos: 104} + c2 := BinlogCoordinates{LogFile: "mysql-bin.00017", LogPos: 104} + c3 := BinlogCoordinates{LogFile: "mysql-bin.00017", LogPos: 5000} + c4 := BinlogCoordinates{LogFile: "mysql-bin.00112", LogPos: 104} + + test.S(t).ExpectTrue(c1.Equals(&c2)) + test.S(t).ExpectFalse(c1.Equals(&c3)) + test.S(t).ExpectFalse(c1.Equals(&c4)) + test.S(t).ExpectFalse(c1.SmallerThan(&c2)) + test.S(t).ExpectTrue(c1.SmallerThan(&c3)) + test.S(t).ExpectTrue(c1.SmallerThan(&c4)) + test.S(t).ExpectTrue(c3.SmallerThan(&c4)) + test.S(t).ExpectFalse(c3.SmallerThan(&c2)) + test.S(t).ExpectFalse(c4.SmallerThan(&c2)) + test.S(t).ExpectFalse(c4.SmallerThan(&c3)) + + test.S(t).ExpectTrue(c1.SmallerThanOrEquals(&c2)) + test.S(t).ExpectTrue(c1.SmallerThanOrEquals(&c3)) +} + +func TestBinlogPrevious(t *testing.T) { + c1 := BinlogCoordinates{LogFile: "mysql-bin.00017", LogPos: 104} + cres, err := 
c1.PreviousFileCoordinates() + + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(c1.Type, cres.Type) + test.S(t).ExpectEquals(cres.LogFile, "mysql-bin.00016") + + c2 := BinlogCoordinates{LogFile: "mysql-bin.00100", LogPos: 104} + cres, err = c2.PreviousFileCoordinates() + + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(c1.Type, cres.Type) + test.S(t).ExpectEquals(cres.LogFile, "mysql-bin.00099") + + c3 := BinlogCoordinates{LogFile: "mysql.00.prod.com.00100", LogPos: 104} + cres, err = c3.PreviousFileCoordinates() + + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(c1.Type, cres.Type) + test.S(t).ExpectEquals(cres.LogFile, "mysql.00.prod.com.00099") + + c4 := BinlogCoordinates{LogFile: "mysql.00.prod.com.00000", LogPos: 104} + _, err = c4.PreviousFileCoordinates() + + test.S(t).ExpectNotNil(err) +} + +func TestBinlogCoordinatesAsKey(t *testing.T) { + m := make(map[BinlogCoordinates]bool) + + c1 := BinlogCoordinates{LogFile: "mysql-bin.00017", LogPos: 104} + c2 := BinlogCoordinates{LogFile: "mysql-bin.00022", LogPos: 104} + c3 := BinlogCoordinates{LogFile: "mysql-bin.00017", LogPos: 104} + c4 := BinlogCoordinates{LogFile: "mysql-bin.00017", LogPos: 222} + + m[c1] = true + m[c2] = true + m[c3] = true + m[c4] = true + + test.S(t).ExpectEquals(len(m), 3) +} + +func TestFileNumberDistance(t *testing.T) { + c1 := BinlogCoordinates{LogFile: "mysql-bin.00017", LogPos: 104} + c2 := BinlogCoordinates{LogFile: "mysql-bin.00022", LogPos: 104} + + test.S(t).ExpectEquals(c1.FileNumberDistance(&c1), 0) + test.S(t).ExpectEquals(c1.FileNumberDistance(&c2), 5) + test.S(t).ExpectEquals(c2.FileNumberDistance(&c1), -5) +} + +func TestFileNumber(t *testing.T) { + c1 := BinlogCoordinates{LogFile: "mysql-bin.00017", LogPos: 104} + fileNum, numLen := c1.FileNumber() + + test.S(t).ExpectEquals(fileNum, 17) + test.S(t).ExpectEquals(numLen, 5) +} diff --git a/go/vt/orchestrator/inst/candidate_database_instance.go b/go/vt/orchestrator/inst/candidate_database_instance.go new file mode 100644 index 0000000000..493a1e7034 --- /dev/null +++ b/go/vt/orchestrator/inst/candidate_database_instance.go @@ -0,0 +1,55 @@ +/* + Copyright 2016 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "fmt" + + "vitess.io/vitess/go/vt/orchestrator/db" +) + +// CandidateDatabaseInstance contains information about explicit promotion rules for an instance +type CandidateDatabaseInstance struct { + Hostname string + Port int + PromotionRule CandidatePromotionRule + LastSuggestedString string + PromotionRuleExpiry string // generated when retrieved from database for consistency reasons +} + +func NewCandidateDatabaseInstance(instanceKey *InstanceKey, promotionRule CandidatePromotionRule) *CandidateDatabaseInstance { + return &CandidateDatabaseInstance{ + Hostname: instanceKey.Hostname, + Port: instanceKey.Port, + PromotionRule: promotionRule, + } +} + +func (cdi *CandidateDatabaseInstance) WithCurrentTime() *CandidateDatabaseInstance { + cdi.LastSuggestedString, _ = db.ReadTimeNow() + return cdi +} + +// String returns a string representation of the CandidateDatabaseInstance struct +func (cdi *CandidateDatabaseInstance) String() string { + return fmt.Sprintf("%s:%d %s", cdi.Hostname, cdi.Port, cdi.PromotionRule) +} + +// Key returns an instance key representing this candidate +func (cdi *CandidateDatabaseInstance) Key() *InstanceKey { + return &InstanceKey{Hostname: cdi.Hostname, Port: cdi.Port} +} diff --git a/go/vt/orchestrator/inst/candidate_database_instance_dao.go b/go/vt/orchestrator/inst/candidate_database_instance_dao.go new file mode 100644 index 0000000000..0bcc5e8e95 --- /dev/null +++ b/go/vt/orchestrator/inst/candidate_database_instance_dao.go @@ -0,0 +1,109 @@ +/* + Copyright 2016 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" +) + +// RegisterCandidateInstance markes a given instance as suggested for successoring a master in the event of failover. +func RegisterCandidateInstance(candidate *CandidateDatabaseInstance) error { + if candidate.LastSuggestedString == "" { + candidate = candidate.WithCurrentTime() + } + args := sqlutils.Args(candidate.Hostname, candidate.Port, string(candidate.PromotionRule), candidate.LastSuggestedString) + + query := fmt.Sprintf(` + insert into candidate_database_instance ( + hostname, + port, + promotion_rule, + last_suggested + ) values ( + ?, ?, ?, ? + ) on duplicate key update + last_suggested=values(last_suggested), + promotion_rule=values(promotion_rule) + `) + writeFunc := func() error { + _, err := db.ExecOrchestrator(query, args...) + AuditOperation("register-candidate", candidate.Key(), string(candidate.PromotionRule)) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +// ExpireCandidateInstances removes stale master candidate suggestions. +func ExpireCandidateInstances() error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + delete from candidate_database_instance + where last_suggested < NOW() - INTERVAL ? 
MINUTE + `, config.Config.CandidateInstanceExpireMinutes, + ) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +// BulkReadCandidateDatabaseInstance returns a slice of +// CandidateDatabaseInstance converted to JSON. +/* +root@myorchestrator [orchestrator]> select * from candidate_database_instance; ++-------------------+------+---------------------+----------+----------------+ +| hostname | port | last_suggested | priority | promotion_rule | ++-------------------+------+---------------------+----------+----------------+ +| host1.example.com | 3306 | 2016-11-22 17:41:06 | 1 | prefer | +| host2.example.com | 3306 | 2016-11-22 17:40:24 | 1 | prefer | ++-------------------+------+---------------------+----------+----------------+ +2 rows in set (0.00 sec) +*/ +func BulkReadCandidateDatabaseInstance() ([]CandidateDatabaseInstance, error) { + var candidateDatabaseInstances []CandidateDatabaseInstance + + // Read all promotion rules from the table + query := ` + SELECT + hostname, + port, + promotion_rule, + last_suggested, + last_suggested + INTERVAL ? MINUTE AS promotion_rule_expiry + FROM + candidate_database_instance + ` + err := db.QueryOrchestrator(query, sqlutils.Args(config.Config.CandidateInstanceExpireMinutes), func(m sqlutils.RowMap) error { + cdi := CandidateDatabaseInstance{ + Hostname: m.GetString("hostname"), + Port: m.GetInt("port"), + PromotionRule: CandidatePromotionRule(m.GetString("promotion_rule")), + LastSuggestedString: m.GetString("last_suggested"), + PromotionRuleExpiry: m.GetString("promotion_rule_expiry"), + } + // add to end of candidateDatabaseInstances + candidateDatabaseInstances = append(candidateDatabaseInstances, cdi) + + return nil + }) + return candidateDatabaseInstances, err +} diff --git a/go/vt/orchestrator/inst/cluster.go b/go/vt/orchestrator/inst/cluster.go new file mode 100644 index 0000000000..d5bb4ca94b --- /dev/null +++ b/go/vt/orchestrator/inst/cluster.go @@ -0,0 +1,136 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + "regexp" + "strings" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/kv" +) + +func GetClusterMasterKVKey(clusterAlias string) string { + return fmt.Sprintf("%s%s", config.Config.KVClusterMasterPrefix, clusterAlias) +} + +func getClusterMasterKVPair(clusterAlias string, masterKey *InstanceKey) *kv.KVPair { + if clusterAlias == "" { + return nil + } + if masterKey == nil { + return nil + } + return kv.NewKVPair(GetClusterMasterKVKey(clusterAlias), masterKey.StringCode()) +} + +// GetClusterMasterKVPairs returns all KV pairs associated with a master. 
This includes the +// full identity of the master as well as a breakdown by hostname, port, ipv4, ipv6 +func GetClusterMasterKVPairs(clusterAlias string, masterKey *InstanceKey) (kvPairs [](*kv.KVPair)) { + masterKVPair := getClusterMasterKVPair(clusterAlias, masterKey) + if masterKVPair == nil { + return kvPairs + } + kvPairs = append(kvPairs, masterKVPair) + + addPair := func(keySuffix, value string) { + key := fmt.Sprintf("%s/%s", masterKVPair.Key, keySuffix) + kvPairs = append(kvPairs, kv.NewKVPair(key, value)) + } + + addPair("hostname", masterKey.Hostname) + addPair("port", fmt.Sprintf("%d", masterKey.Port)) + if ipv4, ipv6, err := readHostnameIPs(masterKey.Hostname); err == nil { + addPair("ipv4", ipv4) + addPair("ipv6", ipv6) + } + return kvPairs +} + +// mappedClusterNameToAlias attempts to match a cluster with an alias based on +// configured ClusterNameToAlias map +func mappedClusterNameToAlias(clusterName string) string { + for pattern, alias := range config.Config.ClusterNameToAlias { + if pattern == "" { + // sanity + continue + } + if matched, _ := regexp.MatchString(pattern, clusterName); matched { + return alias + } + } + return "" +} + +// ClusterInfo makes for a cluster status/info summary +type ClusterInfo struct { + ClusterName string + ClusterAlias string // Human friendly alias + ClusterDomain string // CNAME/VIP/A-record/whatever of the master of this cluster + CountInstances uint + HeuristicLag int64 + HasAutomatedMasterRecovery bool + HasAutomatedIntermediateMasterRecovery bool +} + +// ReadRecoveryInfo +func (this *ClusterInfo) ReadRecoveryInfo() { + this.HasAutomatedMasterRecovery = this.filtersMatchCluster(config.Config.RecoverMasterClusterFilters) + this.HasAutomatedIntermediateMasterRecovery = this.filtersMatchCluster(config.Config.RecoverIntermediateMasterClusterFilters) +} + +// filtersMatchCluster will see whether the given filters match the given cluster details +func (this *ClusterInfo) filtersMatchCluster(filters []string) bool { + for _, filter := range filters { + if filter == this.ClusterName { + return true + } + if filter == this.ClusterAlias { + return true + } + if strings.HasPrefix(filter, "alias=") { + // Match by exact cluster alias name + alias := strings.SplitN(filter, "=", 2)[1] + if alias == this.ClusterAlias { + return true + } + } else if strings.HasPrefix(filter, "alias~=") { + // Match by cluster alias regex + aliasPattern := strings.SplitN(filter, "~=", 2)[1] + if matched, _ := regexp.MatchString(aliasPattern, this.ClusterAlias); matched { + return true + } + } else if filter == "*" { + return true + } else if matched, _ := regexp.MatchString(filter, this.ClusterName); matched && filter != "" { + return true + } + } + return false +} + +// ApplyClusterAlias updates the given clusterInfo's ClusterAlias property +func (this *ClusterInfo) ApplyClusterAlias() { + if this.ClusterAlias != "" && this.ClusterAlias != this.ClusterName { + // Already has an alias; abort + return + } + if alias := mappedClusterNameToAlias(this.ClusterName); alias != "" { + this.ClusterAlias = alias + } +} diff --git a/go/vt/orchestrator/inst/cluster_alias.go b/go/vt/orchestrator/inst/cluster_alias.go new file mode 100644 index 0000000000..ccb3a1bf0b --- /dev/null +++ b/go/vt/orchestrator/inst/cluster_alias.go @@ -0,0 +1,35 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +// SetClusterAlias will write (and override) a single cluster name mapping +func SetClusterAlias(clusterName string, alias string) error { + return writeClusterAlias(clusterName, alias) +} + +// SetClusterAliasManualOverride will write (and override) a single cluster name mapping +func SetClusterAliasManualOverride(clusterName string, alias string) error { + return writeClusterAliasManualOverride(clusterName, alias) +} + +// GetClusterByAlias returns the cluster name associated with given alias. +// The function returns with error when: +// - No cluster is associated with the alias +// - More than one cluster is associated with the alias +func GetClusterByAlias(alias string) (string, error) { + return ReadClusterNameByAlias(alias) +} diff --git a/go/vt/orchestrator/inst/cluster_alias_dao.go b/go/vt/orchestrator/inst/cluster_alias_dao.go new file mode 100644 index 0000000000..fa60033690 --- /dev/null +++ b/go/vt/orchestrator/inst/cluster_alias_dao.go @@ -0,0 +1,226 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +// ReadClusterNameByAlias +func ReadClusterNameByAlias(alias string) (clusterName string, err error) { + query := ` + select + cluster_name + from + cluster_alias + where + alias = ? + or cluster_name = ? + ` + err = db.QueryOrchestrator(query, sqlutils.Args(alias, alias), func(m sqlutils.RowMap) error { + clusterName = m.GetString("cluster_name") + return nil + }) + if err != nil { + return "", err + } + if clusterName == "" { + err = fmt.Errorf("No cluster found for alias %s", alias) + } + return clusterName, err +} + +// DeduceClusterName attempts to resolve a cluster name given a name or alias. +// If unsuccessful to match by alias, the function returns the same given string +func DeduceClusterName(nameOrAlias string) (clusterName string, err error) { + if nameOrAlias == "" { + return "", fmt.Errorf("empty cluster name") + } + if name, err := ReadClusterNameByAlias(nameOrAlias); err == nil { + return name, nil + } + return nameOrAlias, nil +} + +// ReadAliasByClusterName returns the cluster alias for the given cluster name, +// or the cluster name itself if not explicit alias found +func ReadAliasByClusterName(clusterName string) (alias string, err error) { + alias = clusterName // default return value + query := ` + select + alias + from + cluster_alias + where + cluster_name = ? 
+	`
+	err = db.QueryOrchestrator(query, sqlutils.Args(clusterName), func(m sqlutils.RowMap) error {
+		alias = m.GetString("alias")
+		return nil
+	})
+	return alias, err
+}
+
+// writeClusterAlias will write (and override) a single cluster name mapping
+func writeClusterAlias(clusterName string, alias string) error {
+	writeFunc := func() error {
+		_, err := db.ExecOrchestrator(`
+			replace into
+				cluster_alias (cluster_name, alias, last_registered)
+			values
+				(?, ?, now())
+			`,
+			clusterName, alias)
+		return log.Errore(err)
+	}
+	return ExecDBWriteFunc(writeFunc)
+}
+
+// writeClusterAliasManualOverride will write (and override) a single cluster name mapping
+func writeClusterAliasManualOverride(clusterName string, alias string) error {
+	writeFunc := func() error {
+		_, err := db.ExecOrchestrator(`
+			replace into
+				cluster_alias_override (cluster_name, alias)
+			values
+				(?, ?)
+			`,
+			clusterName, alias)
+		return log.Errore(err)
+	}
+	return ExecDBWriteFunc(writeFunc)
+}
+
+// UpdateClusterAliases rewrites the cluster_alias table based on information
+// gained from database_instance
+func UpdateClusterAliases() error {
+	writeFunc := func() error {
+		_, err := db.ExecOrchestrator(`
+			replace into
+				cluster_alias (alias, cluster_name, last_registered)
+			select
+				suggested_cluster_alias,
+				cluster_name,
+				now()
+			from
+				database_instance
+				left join database_instance_downtime using (hostname, port)
+			where
+				suggested_cluster_alias!=''
+				/* exclude newly demoted, downtimed masters */
+				and ifnull(
+					database_instance_downtime.downtime_active = 1
+					and database_instance_downtime.end_timestamp > now()
+					and database_instance_downtime.reason = ?
+					, 0) = 0
+			order by
+				ifnull(last_checked <= last_seen, 0) asc,
+				read_only desc,
+				num_slave_hosts asc
+			`, DowntimeLostInRecoveryMessage)
+		return log.Errore(err)
+	}
+	if err := ExecDBWriteFunc(writeFunc); err != nil {
+		return err
+	}
+	writeFunc = func() error {
+		// Handling the case where no cluster alias exists: we write a dummy alias in the form of the real cluster name.
+		_, err := db.ExecOrchestrator(`
+			replace into
+				cluster_alias (alias, cluster_name, last_registered)
+			select
+				cluster_name as alias, cluster_name, now()
+			from
+				database_instance
+			group by
+				cluster_name
+			having
+				sum(suggested_cluster_alias = '') = count(*)
+			`)
+		return log.Errore(err)
+	}
+	if err := ExecDBWriteFunc(writeFunc); err != nil {
+		return err
+	}
+	return nil
+}
+
+// ReplaceAliasClusterName replaces the alias mapping of one cluster name onto a new cluster name.
+// Used in topology failover/recovery
+func ReplaceAliasClusterName(oldClusterName string, newClusterName string) (err error) {
+	{
+		writeFunc := func() error {
+			_, err := db.ExecOrchestrator(`
+				update cluster_alias
+					set cluster_name = ?
+					where cluster_name = ?
+				`,
+				newClusterName, oldClusterName)
+			return log.Errore(err)
+		}
+		err = ExecDBWriteFunc(writeFunc)
+	}
+	{
+		writeFunc := func() error {
+			_, err := db.ExecOrchestrator(`
+				update cluster_alias_override
+					set cluster_name = ?
+					where cluster_name = ?
+				`,
+				newClusterName, oldClusterName)
+			return log.Errore(err)
+		}
+		if ferr := ExecDBWriteFunc(writeFunc); ferr != nil {
+			err = ferr
+		}
+	}
+	return err
+}
+
+// ReadUnambiguousSuggestedClusterAliases reads potential master hostname:port entries that have suggested
+// cluster aliases which no one else shares. Such a hostname:port is likely the true owner
+// of the alias.
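+//
+// Illustrative usage sketch (an editorial example, not part of the original change; the returned map
+// is keyed by the suggested alias, and "mycluster" below is a hypothetical alias):
+//
+//	aliases, err := ReadUnambiguousSuggestedClusterAliases()
+//	if err == nil {
+//		if key, ok := aliases["mycluster"]; ok {
+//			// exactly one instance suggests "mycluster", so it likely owns the alias
+//			fmt.Printf("unambiguous owner of mycluster: %+v\n", key)
+//		}
+//	}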
+func ReadUnambiguousSuggestedClusterAliases() (result map[string]InstanceKey, err error) { + result = map[string]InstanceKey{} + + query := ` + select + suggested_cluster_alias, + min(hostname) as hostname, + min(port) as port + from + database_instance + where + suggested_cluster_alias != '' + and replication_depth=0 + group by + suggested_cluster_alias + having + count(*) = 1 + ` + err = db.QueryOrchestrator(query, sqlutils.Args(), func(m sqlutils.RowMap) error { + key := InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")} + suggestedAlias := m.GetString("suggested_cluster_alias") + result[suggestedAlias] = key + return nil + }) + return result, err +} diff --git a/go/vt/orchestrator/inst/cluster_domain_dao.go b/go/vt/orchestrator/inst/cluster_domain_dao.go new file mode 100644 index 0000000000..74f8c9c79e --- /dev/null +++ b/go/vt/orchestrator/inst/cluster_domain_dao.go @@ -0,0 +1,54 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +// WriteClusterDomainName will write (and override) the domain name of a cluster +func WriteClusterDomainName(clusterName string, domainName string) error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + insert into + cluster_domain_name (cluster_name, domain_name, last_registered) + values + (?, ?, NOW()) + on duplicate key update + domain_name=values(domain_name), + last_registered=values(last_registered) + `, + clusterName, domainName) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +// ExpireClusterDomainName expires cluster_domain_name entries that haven't been updated recently. +func ExpireClusterDomainName() error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + delete from cluster_domain_name + where last_registered < NOW() - INTERVAL ? MINUTE + `, config.Config.ExpiryHostnameResolvesMinutes, + ) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} diff --git a/go/vt/orchestrator/inst/cluster_test.go b/go/vt/orchestrator/inst/cluster_test.go new file mode 100644 index 0000000000..27b7f212aa --- /dev/null +++ b/go/vt/orchestrator/inst/cluster_test.go @@ -0,0 +1,84 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "fmt" + + "testing" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +var masterKey = InstanceKey{Hostname: "host1", Port: 3306} + +func init() { + config.Config.HostnameResolveMethod = "none" + config.Config.KVClusterMasterPrefix = "test/master/" + config.MarkConfigurationLoaded() + log.SetLevel(log.ERROR) +} + +func TestGetClusterMasterKVKey(t *testing.T) { + kvKey := GetClusterMasterKVKey("foo") + test.S(t).ExpectEquals(kvKey, "test/master/foo") +} + +func TestGetClusterMasterKVPair(t *testing.T) { + { + kvPair := getClusterMasterKVPair("myalias", &masterKey) + test.S(t).ExpectNotNil(kvPair) + test.S(t).ExpectEquals(kvPair.Key, "test/master/myalias") + test.S(t).ExpectEquals(kvPair.Value, masterKey.StringCode()) + } + { + kvPair := getClusterMasterKVPair("", &masterKey) + test.S(t).ExpectTrue(kvPair == nil) + } + { + kvPair := getClusterMasterKVPair("myalias", nil) + test.S(t).ExpectTrue(kvPair == nil) + } +} + +func TestGetClusterMasterKVPairs(t *testing.T) { + kvPairs := GetClusterMasterKVPairs("myalias", &masterKey) + test.S(t).ExpectTrue(len(kvPairs) >= 2) + + { + kvPair := kvPairs[0] + test.S(t).ExpectEquals(kvPair.Key, "test/master/myalias") + test.S(t).ExpectEquals(kvPair.Value, masterKey.StringCode()) + } + { + kvPair := kvPairs[1] + test.S(t).ExpectEquals(kvPair.Key, "test/master/myalias/hostname") + test.S(t).ExpectEquals(kvPair.Value, masterKey.Hostname) + } + { + kvPair := kvPairs[2] + test.S(t).ExpectEquals(kvPair.Key, "test/master/myalias/port") + test.S(t).ExpectEquals(kvPair.Value, fmt.Sprintf("%d", masterKey.Port)) + } +} + +func TestGetClusterMasterKVPairs2(t *testing.T) { + kvPairs := GetClusterMasterKVPairs("", &masterKey) + test.S(t).ExpectEquals(len(kvPairs), 0) +} diff --git a/go/vt/orchestrator/inst/downtime.go b/go/vt/orchestrator/inst/downtime.go new file mode 100644 index 0000000000..90ab2ece22 --- /dev/null +++ b/go/vt/orchestrator/inst/downtime.go @@ -0,0 +1,52 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "time" +) + +type Downtime struct { + Key *InstanceKey + Owner string + Reason string + Duration time.Duration + BeginsAt time.Time + EndsAt time.Time + BeginsAtString string + EndsAtString string +} + +func NewDowntime(instanceKey *InstanceKey, owner string, reason string, duration time.Duration) *Downtime { + downtime := &Downtime{ + Key: instanceKey, + Owner: owner, + Reason: reason, + Duration: duration, + BeginsAt: time.Now(), + } + downtime.EndsAt = downtime.BeginsAt.Add(downtime.Duration) + return downtime +} + +func (downtime *Downtime) Ended() bool { + return downtime.EndsAt.Before(time.Now()) +} + +func (downtime *Downtime) EndsIn() time.Duration { + return downtime.EndsAt.Sub(time.Now()) +} diff --git a/go/vt/orchestrator/inst/downtime_dao.go b/go/vt/orchestrator/inst/downtime_dao.go new file mode 100644 index 0000000000..20cd4c79b6 --- /dev/null +++ b/go/vt/orchestrator/inst/downtime_dao.go @@ -0,0 +1,240 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +// BeginDowntime will make mark an instance as downtimed (or override existing downtime period) +func BeginDowntime(downtime *Downtime) (err error) { + if downtime.Duration == 0 { + downtime.Duration = config.MaintenanceExpireMinutes * time.Minute + } + if downtime.EndsAtString != "" { + _, err = db.ExecOrchestrator(` + insert + into database_instance_downtime ( + hostname, port, downtime_active, begin_timestamp, end_timestamp, owner, reason + ) VALUES ( + ?, ?, 1, ?, ?, ?, ? + ) + on duplicate key update + downtime_active=values(downtime_active), + begin_timestamp=values(begin_timestamp), + end_timestamp=values(end_timestamp), + owner=values(owner), + reason=values(reason) + `, + downtime.Key.Hostname, + downtime.Key.Port, + downtime.BeginsAtString, + downtime.EndsAtString, + downtime.Owner, + downtime.Reason, + ) + } else { + if downtime.Ended() { + // No point in writing it down; it's expired + return nil + } + + _, err = db.ExecOrchestrator(` + insert + into database_instance_downtime ( + hostname, port, downtime_active, begin_timestamp, end_timestamp, owner, reason + ) VALUES ( + ?, ?, 1, NOW(), NOW() + INTERVAL ? SECOND, ?, ? 
+ ) + on duplicate key update + downtime_active=values(downtime_active), + begin_timestamp=values(begin_timestamp), + end_timestamp=values(end_timestamp), + owner=values(owner), + reason=values(reason) + `, + downtime.Key.Hostname, + downtime.Key.Port, + int(downtime.EndsIn().Seconds()), + downtime.Owner, + downtime.Reason, + ) + } + if err != nil { + return log.Errore(err) + } + AuditOperation("begin-downtime", downtime.Key, fmt.Sprintf("owner: %s, reason: %s", downtime.Owner, downtime.Reason)) + + return nil +} + +// EndDowntime will remove downtime flag from an instance +func EndDowntime(instanceKey *InstanceKey) (wasDowntimed bool, err error) { + res, err := db.ExecOrchestrator(` + delete from + database_instance_downtime + where + hostname = ? + and port = ? + `, + instanceKey.Hostname, + instanceKey.Port, + ) + if err != nil { + return wasDowntimed, log.Errore(err) + } + + if affected, _ := res.RowsAffected(); affected > 0 { + wasDowntimed = true + AuditOperation("end-downtime", instanceKey, "") + } + return wasDowntimed, err +} + +// renewLostInRecoveryDowntime renews hosts who are downtimed due to being lost in recovery, such that +// their downtime never expires. +func renewLostInRecoveryDowntime() error { + _, err := db.ExecOrchestrator(` + update + database_instance_downtime + set + end_timestamp = NOW() + INTERVAL ? SECOND + where + end_timestamp > NOW() + and reason = ? + `, + config.LostInRecoveryDowntimeSeconds, + DowntimeLostInRecoveryMessage, + ) + + return err +} + +// expireLostInRecoveryDowntime expires downtime for servers who have been lost in recovery in the last, +// but are now replicating. +func expireLostInRecoveryDowntime() error { + instances, err := ReadLostInRecoveryInstances("") + if err != nil { + return err + } + if len(instances) == 0 { + return nil + } + unambiguousAliases, err := ReadUnambiguousSuggestedClusterAliases() + if err != nil { + return err + } + for _, instance := range instances { + // We _may_ expire this downtime, but only after a minute + // This is a graceful period, during which other servers can claim ownership of the alias, + // or can update their own cluster name to match a new master's name + if instance.ElapsedDowntime < time.Minute { + continue + } + if !instance.IsLastCheckValid { + continue + } + endDowntime := false + if instance.ReplicaRunning() { + // back, alive, replicating in some topology + endDowntime = true + } else if instance.ReplicationDepth == 0 { + // instance makes the appearance of a master + if unambiguousKey, ok := unambiguousAliases[instance.SuggestedClusterAlias]; ok { + if unambiguousKey.Equals(&instance.Key) { + // This instance seems to be a master, which is valid, and has a suggested alias, + // and is the _only_ one to have this suggested alias (i.e. 
no one took its place) + endDowntime = true + } + } + } + if endDowntime { + if _, err := EndDowntime(&instance.Key); err != nil { + return err + } + } + } + return nil +} + +// ExpireDowntime will remove the maintenance flag on old downtimes +func ExpireDowntime() error { + if err := renewLostInRecoveryDowntime(); err != nil { + return log.Errore(err) + } + if err := expireLostInRecoveryDowntime(); err != nil { + return log.Errore(err) + } + { + res, err := db.ExecOrchestrator(` + delete from + database_instance_downtime + where + end_timestamp < NOW() + `, + ) + if err != nil { + return log.Errore(err) + } + if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { + AuditOperation("expire-downtime", nil, fmt.Sprintf("Expired %d entries", rowsAffected)) + } + } + + return nil +} + +func ReadDowntime() (result []Downtime, err error) { + query := ` + select + hostname, + port, + begin_timestamp, + end_timestamp, + owner, + reason + from + database_instance_downtime + where + end_timestamp > now() + ` + err = db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + downtime := Downtime{ + Key: &InstanceKey{}, + } + downtime.Key.Hostname = m.GetString("hostname") + downtime.Key.Port = m.GetInt("port") + downtime.BeginsAt = m.GetTime("begin_timestamp") + downtime.EndsAt = m.GetTime("end_timestamp") + downtime.BeginsAtString = m.GetString("begin_timestamp") + downtime.EndsAtString = m.GetString("end_timestamp") + downtime.Owner = m.GetString("owner") + downtime.Reason = m.GetString("reason") + + downtime.Duration = downtime.EndsAt.Sub(downtime.BeginsAt) + + result = append(result, downtime) + return nil + }) + return result, log.Errore(err) +} diff --git a/go/vt/orchestrator/inst/instance.go b/go/vt/orchestrator/inst/instance.go new file mode 100644 index 0000000000..7bd2638934 --- /dev/null +++ b/go/vt/orchestrator/inst/instance.go @@ -0,0 +1,641 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "database/sql" + "encoding/json" + "fmt" + "strconv" + "strings" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/math" +) + +const ReasonableDiscoveryLatency = 500 * time.Millisecond + +// Instance represents a database instance, including its current configuration & status. +// It presents important replication configuration and detailed replication status. +type Instance struct { + Key InstanceKey + InstanceAlias string + Uptime uint + ServerID uint + ServerUUID string + Version string + VersionComment string + FlavorName string + ReadOnly bool + Binlog_format string + BinlogRowImage string + LogBinEnabled bool + LogSlaveUpdatesEnabled bool // for API backwards compatibility. Equals `LogReplicationUpdatesEnabled` + LogReplicationUpdatesEnabled bool + SelfBinlogCoordinates BinlogCoordinates + MasterKey InstanceKey + MasterUUID string + AncestryUUID string + IsDetachedMaster bool + + Slave_SQL_Running bool // for API backwards compatibility. 
Equals `ReplicationSQLThreadRuning` + ReplicationSQLThreadRuning bool + Slave_IO_Running bool // for API backwards compatibility. Equals `ReplicationIOThreadRuning` + ReplicationIOThreadRuning bool + ReplicationSQLThreadState ReplicationThreadState + ReplicationIOThreadState ReplicationThreadState + + HasReplicationFilters bool + GTIDMode string + SupportsOracleGTID bool + UsingOracleGTID bool + UsingMariaDBGTID bool + UsingPseudoGTID bool + ReadBinlogCoordinates BinlogCoordinates + ExecBinlogCoordinates BinlogCoordinates + IsDetached bool + RelaylogCoordinates BinlogCoordinates + LastSQLError string + LastIOError string + SecondsBehindMaster sql.NullInt64 + SQLDelay uint + ExecutedGtidSet string + GtidPurged string + GtidErrant string + + masterExecutedGtidSet string // Not exported + + SlaveLagSeconds sql.NullInt64 // for API backwards compatibility. Equals `ReplicationLagSeconds` + ReplicationLagSeconds sql.NullInt64 + SlaveHosts InstanceKeyMap // for API backwards compatibility. Equals `Replicas` + Replicas InstanceKeyMap + ClusterName string + SuggestedClusterAlias string + DataCenter string + Region string + PhysicalEnvironment string + ReplicationDepth uint + IsCoMaster bool + HasReplicationCredentials bool + ReplicationCredentialsAvailable bool + SemiSyncAvailable bool // when both semi sync plugins (master & replica) are loaded + SemiSyncEnforced bool + SemiSyncMasterEnabled bool + SemiSyncReplicaEnabled bool + SemiSyncMasterTimeout uint64 + SemiSyncMasterWaitForReplicaCount uint + SemiSyncMasterStatus bool + SemiSyncMasterClients uint + SemiSyncReplicaStatus bool + + LastSeenTimestamp string + IsLastCheckValid bool + IsUpToDate bool + IsRecentlyChecked bool + SecondsSinceLastSeen sql.NullInt64 + CountMySQLSnapshots int + + // Careful. IsCandidate and PromotionRule are used together + // and probably need to be merged. IsCandidate's value may + // be picked up from daabase_candidate_instance's value when + // reading an instance from the db. + IsCandidate bool + PromotionRule CandidatePromotionRule + IsDowntimed bool + DowntimeReason string + DowntimeOwner string + DowntimeEndTimestamp string + ElapsedDowntime time.Duration + UnresolvedHostname string + AllowTLS bool + + Problems []string + + LastDiscoveryLatency time.Duration + + seed bool // Means we force this instance to be written to backend, even if it's invalid, empty or forgotten + + /* All things Group Replication below */ + + // Group replication global variables + ReplicationGroupName string + ReplicationGroupIsSinglePrimary bool + + // Replication group members information. See + // https://dev.mysql.com/doc/refman/8.0/en/replication-group-members-table.html for details. + ReplicationGroupMemberState string + ReplicationGroupMemberRole string + + // List of all known members of the same group + ReplicationGroupMembers InstanceKeyMap + + // Primary of the replication group + ReplicationGroupPrimaryInstanceKey InstanceKey +} + +// NewInstance creates a new, empty instance + +func NewInstance() *Instance { + return &Instance{ + Replicas: make(map[InstanceKey]bool), + ReplicationGroupMembers: make(map[InstanceKey]bool), + Problems: []string{}, + } +} + +func (this *Instance) MarshalJSON() ([]byte, error) { + i := struct { + Instance + }{} + i.Instance = *this + // change terminology. 
Users of the orchestrator API can switch to new terminology and avoid using old terminology
+	// flip
+	i.SlaveHosts = i.Replicas
+	i.SlaveLagSeconds = this.ReplicationLagSeconds
+	i.LogSlaveUpdatesEnabled = this.LogReplicationUpdatesEnabled
+	i.Slave_SQL_Running = this.ReplicationSQLThreadRuning
+	i.Slave_IO_Running = this.ReplicationIOThreadRuning
+
+	return json.Marshal(i)
+}
+
+// Equals tests that this instance is the same instance as other. The function does not test
+// configuration or status.
+func (this *Instance) Equals(other *Instance) bool {
+	return this.Key == other.Key
+}
+
+// MajorVersion returns this instance's major version number (e.g. for 5.5.36 it returns "5.5")
+func (this *Instance) MajorVersion() []string {
+	return MajorVersion(this.Version)
+}
+
+// MajorVersionString returns this instance's major version number as a string (e.g. for 5.5.36 it returns "5.5")
+func (this *Instance) MajorVersionString() string {
+	return strings.Join(this.MajorVersion(), ".")
+}
+
+func (this *Instance) IsMySQL51() bool {
+	return this.MajorVersionString() == "5.1"
+}
+
+func (this *Instance) IsMySQL55() bool {
+	return this.MajorVersionString() == "5.5"
+}
+
+func (this *Instance) IsMySQL56() bool {
+	return this.MajorVersionString() == "5.6"
+}
+
+func (this *Instance) IsMySQL57() bool {
+	return this.MajorVersionString() == "5.7"
+}
+
+func (this *Instance) IsMySQL80() bool {
+	return this.MajorVersionString() == "8.0"
+}
+
+// IsSmallerBinlogFormat returns true when this instance's binlog format is
+// "smaller" than the other's, i.e. binary logs cannot flow from the other instance to this one
+func (this *Instance) IsSmallerBinlogFormat(other *Instance) bool {
+	return IsSmallerBinlogFormat(this.Binlog_format, other.Binlog_format)
+}
+
+// IsSmallerMajorVersion tests this instance against another and returns true if this instance is of a smaller "major" version.
+// e.g. 5.5.36 is NOT a smaller major version as compared to 5.5.36, but IS as compared to 5.6.9
+func (this *Instance) IsSmallerMajorVersion(other *Instance) bool {
+	return IsSmallerMajorVersion(this.Version, other.Version)
+}
+
+// IsSmallerMajorVersionByString checks if this instance has a smaller major version number than the given one
+func (this *Instance) IsSmallerMajorVersionByString(otherVersion string) bool {
+	return IsSmallerMajorVersion(this.Version, otherVersion)
+}
+
+// IsMariaDB checks whether this is any version of MariaDB
+func (this *Instance) IsMariaDB() bool {
+	return strings.Contains(this.Version, "MariaDB")
+}
+
+// IsPercona checks whether this is any version of Percona Server
+func (this *Instance) IsPercona() bool {
+	return strings.Contains(this.VersionComment, "Percona")
+}
+
+// isMaxScale checks whether this is any version of MaxScale
+func (this *Instance) isMaxScale() bool {
+	return strings.Contains(this.Version, "maxscale")
+}
+
+// IsNDB checks whether this is NDB Cluster (aka MySQL Cluster)
+func (this *Instance) IsNDB() bool {
+	return strings.Contains(this.Version, "-ndb-")
+}
+
+// IsReplicationGroupMember checks whether the host thinks it is part of a known replication group.
Notice that this might +// return True even if the group has decided to expel the member represented by this instance, as the instance might not +// know that under certain circumstances +func (this *Instance) IsReplicationGroupMember() bool { + return this.ReplicationGroupName != "" +} + +func (this *Instance) IsReplicationGroupPrimary() bool { + return this.IsReplicationGroupMember() && this.ReplicationGroupPrimaryInstanceKey.Equals(&this.Key) +} + +func (this *Instance) IsReplicationGroupSecondary() bool { + return this.IsReplicationGroupMember() && !this.ReplicationGroupPrimaryInstanceKey.Equals(&this.Key) +} + +// IsBinlogServer checks whether this is any type of a binlog server (currently only maxscale) +func (this *Instance) IsBinlogServer() bool { + if this.isMaxScale() { + return true + } + return false +} + +// IsOracleMySQL checks whether this is an Oracle MySQL distribution +func (this *Instance) IsOracleMySQL() bool { + if this.IsMariaDB() { + return false + } + if this.IsPercona() { + return false + } + if this.isMaxScale() { + return false + } + if this.IsBinlogServer() { + return false + } + return true +} + +func (this *Instance) SetSeed() { + this.seed = true +} +func (this *Instance) IsSeed() bool { + return this.seed +} + +// applyFlavorName +func (this *Instance) applyFlavorName() { + if this == nil { + return + } + if this.IsOracleMySQL() { + this.FlavorName = "MySQL" + } else if this.IsMariaDB() { + this.FlavorName = "MariaDB" + } else if this.IsPercona() { + this.FlavorName = "Percona" + } else if this.isMaxScale() { + this.FlavorName = "MaxScale" + } else { + this.FlavorName = "unknown" + } +} + +// FlavorNameAndMajorVersion returns a string of the combined +// flavor and major version which is useful in some checks. +func (this *Instance) FlavorNameAndMajorVersion() string { + if this.FlavorName == "" { + this.applyFlavorName() + } + + return this.FlavorName + "-" + this.MajorVersionString() +} + +// IsReplica makes simple heuristics to decide whether this instance is a replica of another instance +func (this *Instance) IsReplica() bool { + return this.MasterKey.Hostname != "" && this.MasterKey.Hostname != "_" && this.MasterKey.Port != 0 && (this.ReadBinlogCoordinates.LogFile != "" || this.UsingGTID()) +} + +// IsMaster makes simple heuristics to decide whether this instance is a master (not replicating from any other server), +// either via traditional async/semisync replication or group replication +func (this *Instance) IsMaster() bool { + // If traditional replication is configured, it is for sure not a master + if this.IsReplica() { + return false + } + // If traditional replication is not configured, and it is also not part of a replication group, this host is + // a master + if !this.IsReplicationGroupMember() { + return true + } + // If traditional replication is not configured, and this host is part of a group, it is only considered a + // master if it has the role of group Primary. Otherwise it is not a master. + if this.ReplicationGroupMemberRole == GroupReplicationMemberRolePrimary { + return true + } + return false +} + +// ReplicaRunning returns true when this instance's status is of a replicating replica. 
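+// As an editorial note (not part of the original change): ReplicaRunning requires both the IO and
+// SQL thread states to be running, whereas ReplicationThreadsStopped below requires both to be
+// stopped; an instance with only one thread running satisfies neither. A hypothetical caller-side
+// check might read:
+//
+//	if instance.IsReplica() && !instance.ReplicaRunning() {
+//		// at least one of the replication threads is not running
+//	}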
+func (this *Instance) ReplicaRunning() bool {
+	return this.IsReplica() && this.ReplicationSQLThreadState.IsRunning() && this.ReplicationIOThreadState.IsRunning()
+}
+
+// ReplicationThreadsStopped returns true when both the SQL and IO threads are stopped (including the case where this isn't even a replica)
+func (this *Instance) ReplicationThreadsStopped() bool {
+	return this.ReplicationSQLThreadState.IsStopped() && this.ReplicationIOThreadState.IsStopped()
+}
+
+// ReplicationThreadsExist returns true when both the SQL and IO thread states indicate that the threads exist
+func (this *Instance) ReplicationThreadsExist() bool {
+	return this.ReplicationSQLThreadState.Exists() && this.ReplicationIOThreadState.Exists()
+}
+
+// SQLThreadUpToDate returns true when the instance has consumed all relay logs.
+func (this *Instance) SQLThreadUpToDate() bool {
+	return this.ReadBinlogCoordinates.Equals(&this.ExecBinlogCoordinates)
+}
+
+// UsingGTID returns true when this replica is currently replicating via GTID (either Oracle or MariaDB)
+func (this *Instance) UsingGTID() bool {
+	return this.UsingOracleGTID || this.UsingMariaDBGTID
+}
+
+// NextGTID returns the next (Oracle) GTID to be executed. Useful for skipping queries
+func (this *Instance) NextGTID() (string, error) {
+	if this.ExecutedGtidSet == "" {
+		return "", fmt.Errorf("No value found in Executed_Gtid_Set; cannot compute NextGTID")
+	}
+
+	firstToken := func(s string, delimiter string) string {
+		tokens := strings.Split(s, delimiter)
+		return tokens[0]
+	}
+	lastToken := func(s string, delimiter string) string {
+		tokens := strings.Split(s, delimiter)
+		return tokens[len(tokens)-1]
+	}
+	// executed GTID set: 4f6d62ed-df65-11e3-b395-60672090eb04:1,b9b4712a-df64-11e3-b391-60672090eb04:1-6
+	executedGTIDsFromMaster := lastToken(this.ExecutedGtidSet, ",")
+	// executedGTIDsFromMaster: b9b4712a-df64-11e3-b391-60672090eb04:1-6
+	executedRange := lastToken(executedGTIDsFromMaster, ":")
+	// executedRange: 1-6
+	lastExecutedNumberToken := lastToken(executedRange, "-")
+	// lastExecutedNumber: 6
+	lastExecutedNumber, err := strconv.Atoi(lastExecutedNumberToken)
+	if err != nil {
+		return "", err
+	}
+	nextNumber := lastExecutedNumber + 1
+	nextGTID := fmt.Sprintf("%s:%d", firstToken(executedGTIDsFromMaster, ":"), nextNumber)
+	return nextGTID, nil
+}
+
+// AddReplicaKey adds a replica to the list of this instance's replicas.
+func (this *Instance) AddReplicaKey(replicaKey *InstanceKey) {
+	this.Replicas.AddKey(*replicaKey)
+}
+
+// AddGroupMemberKey adds a group member to the list of this instance's group members.
+func (this *Instance) AddGroupMemberKey(groupMemberKey *InstanceKey) {
+	this.ReplicationGroupMembers.AddKey(*groupMemberKey)
+}
+
+// GetNextBinaryLog returns the successive, if any, binary log file to the one given
+func (this *Instance) GetNextBinaryLog(binlogCoordinates BinlogCoordinates) (BinlogCoordinates, error) {
+	if binlogCoordinates.LogFile == this.SelfBinlogCoordinates.LogFile {
+		return binlogCoordinates, fmt.Errorf("Cannot find next binary log for %+v", binlogCoordinates)
+	}
+	return binlogCoordinates.NextFileCoordinates()
+}
+
+// IsReplicaOf returns true if this instance claims to replicate from the given master
+func (this *Instance) IsReplicaOf(master *Instance) bool {
+	return this.MasterKey.Equals(&master.Key)
+}
+
+// IsMasterOf returns true if this instance is the supposed master of the given replica
+func (this *Instance) IsMasterOf(replica *Instance) bool {
+	return replica.IsReplicaOf(this)
+}
+
+// IsDescendantOf returns true if this instance replicates, directly or indirectly, from other
+func (this *Instance) IsDescendantOf(other *Instance) bool {
+	for _, uuid := range strings.Split(this.AncestryUUID, ",") {
+		if uuid == other.ServerUUID && uuid != "" {
+			return true
+		}
+	}
+	return false
+}
+
+// CanReplicateFrom uses heuristics to decide whether this instance can practically replicate from the other instance.
+// Checks are made on binlog format, version number, binary logs etc.
+func (this *Instance) CanReplicateFrom(other *Instance) (bool, error) {
+	if this.Key.Equals(&other.Key) {
+		return false, fmt.Errorf("instance cannot replicate from itself: %+v", this.Key)
+	}
+	if !other.LogBinEnabled {
+		return false, fmt.Errorf("instance does not have binary logs enabled: %+v", other.Key)
+	}
+	if other.IsReplica() {
+		if !other.LogReplicationUpdatesEnabled {
+			return false, fmt.Errorf("instance does not have log_slave_updates enabled: %+v", other.Key)
+		}
+		// OK for a master to not have log_slave_updates
+		// Not OK for a replica, for it has to relay the logs.
+	}
+	if this.IsSmallerMajorVersion(other) && !this.IsBinlogServer() {
+		return false, fmt.Errorf("instance %+v has version %s, which is lower than %s on %+v ", this.Key, this.Version, other.Version, other.Key)
+	}
+	if this.LogBinEnabled && this.LogReplicationUpdatesEnabled {
+		if this.IsSmallerBinlogFormat(other) {
+			return false, fmt.Errorf("Cannot replicate from %+v binlog format on %+v to %+v on %+v", other.Binlog_format, other.Key, this.Binlog_format, this.Key)
+		}
+	}
+	if config.Config.VerifyReplicationFilters {
+		if other.HasReplicationFilters && !this.HasReplicationFilters {
+			return false, fmt.Errorf("%+v has replication filters", other.Key)
+		}
+	}
+	if this.ServerID == other.ServerID && !this.IsBinlogServer() {
+		return false, fmt.Errorf("Identical server id: %+v, %+v both have %d", other.Key, this.Key, this.ServerID)
+	}
+	if this.ServerUUID == other.ServerUUID && this.ServerUUID != "" && !this.IsBinlogServer() {
+		return false, fmt.Errorf("Identical server UUID: %+v, %+v both have %s", other.Key, this.Key, this.ServerUUID)
+	}
+	if this.SQLDelay < other.SQLDelay && int64(other.SQLDelay) > int64(config.Config.ReasonableMaintenanceReplicationLagSeconds) {
+		return false, fmt.Errorf("%+v has higher SQL_Delay (%+v seconds) than %+v does (%+v seconds)", other.Key, other.SQLDelay, this.Key, this.SQLDelay)
+	}
+	return true, nil
+}
+
+// HasReasonableMaintenanceReplicationLag returns true when the replica lag is reasonable, and maintenance operations should have a green light to go.
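+// As an illustration (editorial example, not part of the original change): with a configured
+// ReasonableMaintenanceReplicationLagSeconds of 20, a replica with SQLDelay=3600 and
+// SecondsBehindMaster=3610 is considered fine (|3610-3600| = 10 <= 20), while one with
+// SecondsBehindMaster=3700 is not (|3700-3600| = 100 > 20).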
+func (this *Instance) HasReasonableMaintenanceReplicationLag() bool { + // replicas with SQLDelay are a special case + if this.SQLDelay > 0 { + return math.AbsInt64(this.SecondsBehindMaster.Int64-int64(this.SQLDelay)) <= int64(config.Config.ReasonableMaintenanceReplicationLagSeconds) + } + return this.SecondsBehindMaster.Int64 <= int64(config.Config.ReasonableMaintenanceReplicationLagSeconds) +} + +// CanMove returns true if this instance's state allows it to be repositioned. For example, +// if this instance lags too much, it will not be moveable. +func (this *Instance) CanMove() (bool, error) { + if !this.IsLastCheckValid { + return false, fmt.Errorf("%+v: last check invalid", this.Key) + } + if !this.IsRecentlyChecked { + return false, fmt.Errorf("%+v: not recently checked", this.Key) + } + if !this.ReplicationSQLThreadState.IsRunning() { + return false, fmt.Errorf("%+v: instance is not replicating", this.Key) + } + if !this.ReplicationIOThreadState.IsRunning() { + return false, fmt.Errorf("%+v: instance is not replicating", this.Key) + } + if !this.SecondsBehindMaster.Valid { + return false, fmt.Errorf("%+v: cannot determine replication lag", this.Key) + } + if !this.HasReasonableMaintenanceReplicationLag() { + return false, fmt.Errorf("%+v: lags too much", this.Key) + } + return true, nil +} + +// CanMoveAsCoMaster returns true if this instance's state allows it to be repositioned. +func (this *Instance) CanMoveAsCoMaster() (bool, error) { + if !this.IsLastCheckValid { + return false, fmt.Errorf("%+v: last check invalid", this.Key) + } + if !this.IsRecentlyChecked { + return false, fmt.Errorf("%+v: not recently checked", this.Key) + } + return true, nil +} + +// CanMoveViaMatch returns true if this instance's state allows it to be repositioned via pseudo-GTID matching +func (this *Instance) CanMoveViaMatch() (bool, error) { + if !this.IsLastCheckValid { + return false, fmt.Errorf("%+v: last check invalid", this.Key) + } + if !this.IsRecentlyChecked { + return false, fmt.Errorf("%+v: not recently checked", this.Key) + } + return true, nil +} + +// StatusString returns a human readable description of this instance's status +func (this *Instance) StatusString() string { + if !this.IsLastCheckValid { + return "invalid" + } + if !this.IsRecentlyChecked { + return "unchecked" + } + if this.IsReplica() && !this.ReplicaRunning() { + return "nonreplicating" + } + if this.IsReplica() && !this.HasReasonableMaintenanceReplicationLag() { + return "lag" + } + return "ok" +} + +// LagStatusString returns a human readable representation of current lag +func (this *Instance) LagStatusString() string { + if this.IsDetached { + return "detached" + } + if !this.IsLastCheckValid { + return "unknown" + } + if !this.IsRecentlyChecked { + return "unknown" + } + if this.IsReplica() && !this.ReplicaRunning() { + return "null" + } + if this.IsReplica() && !this.SecondsBehindMaster.Valid { + return "null" + } + if this.IsReplica() && this.ReplicationLagSeconds.Int64 > int64(config.Config.ReasonableMaintenanceReplicationLagSeconds) { + return fmt.Sprintf("%+vs", this.ReplicationLagSeconds.Int64) + } + return fmt.Sprintf("%+vs", this.ReplicationLagSeconds.Int64) +} + +func (this *Instance) descriptionTokens() (tokens []string) { + tokens = append(tokens, this.LagStatusString()) + tokens = append(tokens, this.StatusString()) + tokens = append(tokens, this.Version) + if this.ReadOnly { + tokens = append(tokens, "ro") + } else { + tokens = append(tokens, "rw") + } + if this.LogBinEnabled { + tokens = append(tokens, 
this.Binlog_format) + } else { + tokens = append(tokens, "nobinlog") + } + { + extraTokens := []string{} + if this.LogBinEnabled && this.LogReplicationUpdatesEnabled { + extraTokens = append(extraTokens, ">>") + } + if this.UsingGTID() || this.SupportsOracleGTID { + token := "GTID" + if this.GtidErrant != "" { + token = fmt.Sprintf("%s:errant", token) + } + extraTokens = append(extraTokens, token) + } + if this.UsingPseudoGTID { + extraTokens = append(extraTokens, "P-GTID") + } + if this.SemiSyncMasterStatus { + extraTokens = append(extraTokens, "semi:master") + } + if this.SemiSyncReplicaStatus { + extraTokens = append(extraTokens, "semi:replica") + } + if this.IsDowntimed { + extraTokens = append(extraTokens, "downtimed") + } + tokens = append(tokens, strings.Join(extraTokens, ",")) + } + return tokens +} + +// HumanReadableDescription returns a simple readable string describing the status, version, +// etc. properties of this instance +func (this *Instance) HumanReadableDescription() string { + tokens := this.descriptionTokens() + nonEmptyTokens := []string{} + for _, token := range tokens { + if token != "" { + nonEmptyTokens = append(nonEmptyTokens, token) + } + } + description := fmt.Sprintf("[%s]", strings.Join(nonEmptyTokens, ",")) + return description +} + +// TabulatedDescription returns a simple tabulated string of various properties +func (this *Instance) TabulatedDescription(separator string) string { + tokens := this.descriptionTokens() + description := fmt.Sprintf("%s", strings.Join(tokens, separator)) + return description +} diff --git a/go/vt/orchestrator/inst/instance_binlog.go b/go/vt/orchestrator/inst/instance_binlog.go new file mode 100644 index 0000000000..267cfd7ba0 --- /dev/null +++ b/go/vt/orchestrator/inst/instance_binlog.go @@ -0,0 +1,191 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "errors" + "regexp" + "strings" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +// Event entries may contains table IDs (can be different for same tables on different servers) +// and also COMMIT transaction IDs (different values on different servers). +// So these need to be removed from the event entry if we're to compare and validate matching +// entries. +var eventInfoTransformations map[*regexp.Regexp]string = map[*regexp.Regexp]string{ + regexp.MustCompile(`(.*) [/][*].*?[*][/](.*$)`): "$1 $2", // strip comments + regexp.MustCompile(`(COMMIT) .*$`): "$1", // commit number varies cross servers + regexp.MustCompile(`(table_id:) [0-9]+ (.*$)`): "$1 ### $2", // table ids change cross servers + regexp.MustCompile(`(table_id:) [0-9]+$`): "$1 ###", // table ids change cross servers + regexp.MustCompile(` X'([0-9a-fA-F]+)' COLLATE`): " 0x$1 COLLATE", // different ways to represent collate + regexp.MustCompile(`(BEGIN GTID [^ ]+) cid=.*`): "$1", // MariaDB GTID someimtes gets addition of "cid=...". 
Stripping +} + +var skippedEventTypes map[string]bool = map[string]bool{ + "Format_desc": true, + "Stop": true, + "Rotate": true, +} + +type BinlogEvent struct { + Coordinates BinlogCoordinates + NextEventPos int64 + EventType string + Info string +} + +// +func (this *BinlogEvent) NextBinlogCoordinates() BinlogCoordinates { + return BinlogCoordinates{LogFile: this.Coordinates.LogFile, LogPos: this.NextEventPos, Type: this.Coordinates.Type} +} + +// +func (this *BinlogEvent) NormalizeInfo() { + for reg, replace := range eventInfoTransformations { + this.Info = reg.ReplaceAllString(this.Info, replace) + } +} + +func (this *BinlogEvent) Equals(other *BinlogEvent) bool { + return this.Coordinates.Equals(&other.Coordinates) && + this.NextEventPos == other.NextEventPos && + this.EventType == other.EventType && this.Info == other.Info +} + +func (this *BinlogEvent) EqualsIgnoreCoordinates(other *BinlogEvent) bool { + return this.NextEventPos == other.NextEventPos && + this.EventType == other.EventType && this.Info == other.Info +} + +const maxEmptyEventsEvents int = 10 + +// +type BinlogEventCursor struct { + cachedEvents []BinlogEvent + currentEventIndex int + fetchNextEvents func(BinlogCoordinates) ([]BinlogEvent, error) + nextCoordinates BinlogCoordinates +} + +// fetchNextEventsFunc expected to return events starting at a given position, and automatically fetch those from next +// binary log when no more rows are found in current log. +// It is expected to return empty array with no error upon end of binlogs +// It is expected to return error upon error... +func NewBinlogEventCursor(startCoordinates BinlogCoordinates, fetchNextEventsFunc func(BinlogCoordinates) ([]BinlogEvent, error)) BinlogEventCursor { + events, _ := fetchNextEventsFunc(startCoordinates) + var initialNextCoordinates BinlogCoordinates + if len(events) > 0 { + initialNextCoordinates = events[0].NextBinlogCoordinates() + } + return BinlogEventCursor{ + cachedEvents: events, + currentEventIndex: -1, + fetchNextEvents: fetchNextEventsFunc, + nextCoordinates: initialNextCoordinates, + } +} + +// nextEvent will return the next event entry from binary logs; it will automatically skip to next +// binary log if need be. +// Internally, it uses the cachedEvents array, so that it does not go to the MySQL server upon each call. +// Returns nil upon reaching end of binary logs. +func (this *BinlogEventCursor) nextEvent(numEmptyEventsEvents int) (*BinlogEvent, error) { + if numEmptyEventsEvents > maxEmptyEventsEvents { + log.Debugf("End of logs. 
currentEventIndex: %d, nextCoordinates: %+v", this.currentEventIndex, this.nextCoordinates) + // End of logs + return nil, nil + } + if len(this.cachedEvents) == 0 { + // Cache exhausted; get next bulk of entries and return the next entry + nextFileCoordinates, err := this.nextCoordinates.NextFileCoordinates() + if err != nil { + return nil, err + } + log.Debugf("zero cached events, next file: %+v", nextFileCoordinates) + this.cachedEvents, err = this.fetchNextEvents(nextFileCoordinates) + if err != nil { + return nil, err + } + this.currentEventIndex = -1 + // While this seems recursive do note that recursion level is at most 1, since we either have + // entries in the next binlog (no further recursion) or we don't (immediate termination) + return this.nextEvent(numEmptyEventsEvents + 1) + } + if this.currentEventIndex+1 < len(this.cachedEvents) { + // We have enough cache to go by + this.currentEventIndex++ + event := &this.cachedEvents[this.currentEventIndex] + this.nextCoordinates = event.NextBinlogCoordinates() + return event, nil + } else { + // Cache exhausted; get next bulk of entries and return the next entry + var err error + this.cachedEvents, err = this.fetchNextEvents(this.cachedEvents[len(this.cachedEvents)-1].NextBinlogCoordinates()) + if err != nil { + return nil, err + } + this.currentEventIndex = -1 + // While this seems recursive do note that recursion level is at most 1, since we either have + // entries in the next binlog (no further recursion) or we don't (immediate termination) + return this.nextEvent(numEmptyEventsEvents + 1) + } +} + +// NextRealEvent returns the next event from binlog that is not meta/control event (these are start-of-binary-log, +// rotate-binary-log etc.) +func (this *BinlogEventCursor) nextRealEvent(recursionLevel int) (*BinlogEvent, error) { + if recursionLevel > maxEmptyEventsEvents { + log.Debugf("End of real events") + return nil, nil + } + event, err := this.nextEvent(0) + if err != nil { + return event, err + } + if event == nil { + return event, err + } + + if _, found := skippedEventTypes[event.EventType]; found { + // Recursion will not be deep here. A few entries (end-of-binlog followed by start-of-bin-log) are possible, + // but we really don't expect a huge sequence of those. + return this.nextRealEvent(recursionLevel + 1) + } + for _, skipSubstring := range config.Config.SkipBinlogEventsContaining { + if strings.Index(event.Info, skipSubstring) >= 0 { + // Recursion might go deeper here. + return this.nextRealEvent(recursionLevel + 1) + } + } + event.NormalizeInfo() + return event, err +} + +// NextCoordinates return the binlog coordinates of the next entry as yet unprocessed by the cursor. +// Moreover, when the cursor terminates (consumes last entry), these coordinates indicate what will be the futuristic +// coordinates of the next binlog entry. +// The value of this function is used by match-below to move a replica behind another, after exhausting the shared binlog +// entries of both. +func (this *BinlogEventCursor) getNextCoordinates() (BinlogCoordinates, error) { + if this.nextCoordinates.LogPos == 0 { + return this.nextCoordinates, errors.New("Next coordinates unfound") + } + return this.nextCoordinates, nil +} diff --git a/go/vt/orchestrator/inst/instance_binlog_dao.go b/go/vt/orchestrator/inst/instance_binlog_dao.go new file mode 100644 index 0000000000..2dcf1f1e1d --- /dev/null +++ b/go/vt/orchestrator/inst/instance_binlog_dao.go @@ -0,0 +1,864 @@ +/* + Copyright 2014 Outbrain Inc. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + "regexp" + "strings" + "time" + + "github.com/patrickmn/go-cache" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/math" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +const maxEmptyBinlogFiles int = 10 +const maxEventInfoDisplayLength int = 200 + +var instanceBinlogEntryCache *cache.Cache + +func init() { + go initializeBinlogDaoPostConfiguration() +} + +func initializeBinlogDaoPostConfiguration() { + config.WaitForConfigurationToBeLoaded() + + instanceBinlogEntryCache = cache.New(time.Duration(10)*time.Minute, time.Minute) +} + +func compilePseudoGTIDPattern() (pseudoGTIDRegexp *regexp.Regexp, err error) { + log.Debugf("PseudoGTIDPatternIsFixedSubstring: %+v", config.Config.PseudoGTIDPatternIsFixedSubstring) + if config.Config.PseudoGTIDPatternIsFixedSubstring { + return nil, nil + } + log.Debugf("Compiling PseudoGTIDPattern: %q", config.Config.PseudoGTIDPattern) + return regexp.Compile(config.Config.PseudoGTIDPattern) +} + +// pseudoGTIDMatches attempts to match given string with pseudo GTID pattern/text. +func pseudoGTIDMatches(pseudoGTIDRegexp *regexp.Regexp, binlogEntryInfo string) (found bool) { + if config.Config.PseudoGTIDPatternIsFixedSubstring { + return strings.Contains(binlogEntryInfo, config.Config.PseudoGTIDPattern) + } + return pseudoGTIDRegexp.MatchString(binlogEntryInfo) +} + +func getInstanceBinlogEntryKey(instance *Instance, entry string) string { + return fmt.Sprintf("%s;%s", instance.Key.DisplayString(), entry) +} + +// Try and find the last position of a pseudo GTID query entry in the given binary log. +// Also return the full text of that entry. +// maxCoordinates is the position beyond which we should not read. This is relevant when reading relay logs; in particular, +// the last relay log. We must be careful not to scan for Pseudo-GTID entries past the position executed by the SQL thread. +// maxCoordinates == nil means no limit. +func getLastPseudoGTIDEntryInBinlog(pseudoGTIDRegexp *regexp.Regexp, instanceKey *InstanceKey, binlog string, binlogType BinlogType, minCoordinates *BinlogCoordinates, maxCoordinates *BinlogCoordinates) (*BinlogCoordinates, string, error) { + if binlog == "" { + return nil, "", log.Errorf("getLastPseudoGTIDEntryInBinlog: empty binlog file name for %+v. 
maxCoordinates = %+v", *instanceKey, maxCoordinates) + } + binlogCoordinates := BinlogCoordinates{LogFile: binlog, LogPos: 0, Type: binlogType} + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return nil, "", err + } + + moreRowsExpected := true + var nextPos int64 = 0 + var relyLogMinPos int64 = 0 + if minCoordinates != nil && minCoordinates.LogFile == binlog { + log.Debugf("getLastPseudoGTIDEntryInBinlog: starting with %+v", *minCoordinates) + nextPos = minCoordinates.LogPos + relyLogMinPos = minCoordinates.LogPos + } + step := 0 + + entryText := "" + for moreRowsExpected { + query := "" + if binlogCoordinates.Type == BinaryLog { + query = fmt.Sprintf("show binlog events in '%s' FROM %d LIMIT %d", binlog, nextPos, config.Config.BinlogEventsChunkSize) + } else { + query = fmt.Sprintf("show relaylog events in '%s' FROM %d LIMIT %d,%d", binlog, relyLogMinPos, (step * config.Config.BinlogEventsChunkSize), config.Config.BinlogEventsChunkSize) + } + + moreRowsExpected = false + + err = sqlutils.QueryRowsMapBuffered(db, query, func(m sqlutils.RowMap) error { + moreRowsExpected = true + nextPos = m.GetInt64("End_log_pos") + binlogEntryInfo := m.GetString("Info") + if pseudoGTIDMatches(pseudoGTIDRegexp, binlogEntryInfo) { + if maxCoordinates != nil && maxCoordinates.SmallerThan(&BinlogCoordinates{LogFile: binlog, LogPos: m.GetInt64("Pos")}) { + // past the limitation + moreRowsExpected = false + return nil + } + binlogCoordinates.LogPos = m.GetInt64("Pos") + entryText = binlogEntryInfo + // Found a match. But we keep searching: we're interested in the LAST entry, and, alas, + // we can only search in ASCENDING order... + } + return nil + }) + if err != nil { + return nil, "", err + } + step++ + } + + // Not found? return nil. an error is reserved to SQL problems. + if binlogCoordinates.LogPos == 0 { + return nil, "", nil + } + return &binlogCoordinates, entryText, err +} + +// getLastPseudoGTIDEntryInInstance will search for the last pseudo GTID entry in an instance's binary logs. Arguments: +// - instance +// - minBinlogCoordinates: a hint, suggested coordinates to start with. The search will _attempt_ to begin search from +// these coordinates, but if search is empty, then we failback to full search, ignoring this hint +// - maxBinlogCoordinates: a hard limit on the maximum position we're allowed to investigate. +// - exhaustiveSearch: when 'true', continue iterating binary logs. When 'false', only investigate most recent binary log. +func getLastPseudoGTIDEntryInInstance(instance *Instance, minBinlogCoordinates *BinlogCoordinates, maxBinlogCoordinates *BinlogCoordinates, exhaustiveSearch bool) (*BinlogCoordinates, string, error) { + pseudoGTIDRegexp, err := compilePseudoGTIDPattern() + if err != nil { + return nil, "", err + } + // Look for last GTID in instance: + currentBinlog := instance.SelfBinlogCoordinates + + err = nil + for err == nil { + log.Debugf("Searching for latest pseudo gtid entry in binlog %+v of %+v", currentBinlog.LogFile, instance.Key) + resultCoordinates, entryInfo, err := getLastPseudoGTIDEntryInBinlog(pseudoGTIDRegexp, &instance.Key, currentBinlog.LogFile, BinaryLog, minBinlogCoordinates, maxBinlogCoordinates) + if err != nil { + return nil, "", err + } + if resultCoordinates != nil { + log.Debugf("Found pseudo gtid entry in %+v, %+v", instance.Key, resultCoordinates) + return resultCoordinates, entryInfo, err + } + if !exhaustiveSearch { + log.Debugf("Not an exhaustive search. 
Bailing out") + break + } + if minBinlogCoordinates != nil && minBinlogCoordinates.LogFile == currentBinlog.LogFile { + // We tried and failed with the minBinlogCoordinates heuristic/hint. We no longer require it, + // and continue with exhaustive search, on same binlog. + minBinlogCoordinates = nil + log.Debugf("Heuristic binlog search failed; continuing exhaustive search") + // And we do NOT iterate the log file: we scan same log file again, with no heuristic + //return nil, "", log.Errorf("past minBinlogCoordinates (%+v); skipping iteration over rest of binary logs", *minBinlogCoordinates) + } else { + currentBinlog, err = currentBinlog.PreviousFileCoordinates() + if err != nil { + return nil, "", err + } + } + } + return nil, "", log.Errorf("Cannot find pseudo GTID entry in binlogs of %+v", instance.Key) +} + +func getLastPseudoGTIDEntryInRelayLogs(instance *Instance, minBinlogCoordinates *BinlogCoordinates, recordedInstanceRelayLogCoordinates BinlogCoordinates, exhaustiveSearch bool) (*BinlogCoordinates, string, error) { + // Look for last GTID in relay logs: + // Since MySQL does not provide with a SHOW RELAY LOGS command, we heuristically start from current + // relay log (indiciated by Relay_log_file) and walk backwards. + // Eventually we will hit a relay log name which does not exist. + pseudoGTIDRegexp, err := compilePseudoGTIDPattern() + if err != nil { + return nil, "", err + } + + currentRelayLog := recordedInstanceRelayLogCoordinates + err = nil + for err == nil { + log.Debugf("Searching for latest pseudo gtid entry in relaylog %+v of %+v, up to pos %+v", currentRelayLog.LogFile, instance.Key, recordedInstanceRelayLogCoordinates) + if resultCoordinates, entryInfo, err := getLastPseudoGTIDEntryInBinlog(pseudoGTIDRegexp, &instance.Key, currentRelayLog.LogFile, RelayLog, minBinlogCoordinates, &recordedInstanceRelayLogCoordinates); err != nil { + return nil, "", err + } else if resultCoordinates != nil { + log.Debugf("Found pseudo gtid entry in %+v, %+v", instance.Key, resultCoordinates) + return resultCoordinates, entryInfo, err + } + if !exhaustiveSearch { + break + } + if minBinlogCoordinates != nil && minBinlogCoordinates.LogFile == currentRelayLog.LogFile { + // We tried and failed with the minBinlogCoordinates hint. We no longer require it, + // and continue with exhaustive search. 
+ minBinlogCoordinates = nil + log.Debugf("Heuristic relaylog search failed; continuing exhaustive search") + // And we do NOT iterate to previous log file: we scan same log file again, with no heuristic + } else { + currentRelayLog, err = currentRelayLog.PreviousFileCoordinates() + } + } + return nil, "", log.Errorf("Cannot find pseudo GTID entry in relay logs of %+v", instance.Key) +} + +func readBinlogEvent(binlogEvent *BinlogEvent, m sqlutils.RowMap) error { + binlogEvent.NextEventPos = m.GetInt64("End_log_pos") + binlogEvent.Coordinates.LogPos = m.GetInt64("Pos") + binlogEvent.EventType = m.GetString("Event_type") + binlogEvent.Info = m.GetString("Info") + return nil +} + +func ReadBinlogEventAtRelayLogCoordinates(instanceKey *InstanceKey, relaylogCoordinates *BinlogCoordinates) (binlogEvent *BinlogEvent, err error) { + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return nil, err + } + + query := fmt.Sprintf("show relaylog events in '%s' FROM %d LIMIT 1", relaylogCoordinates.LogFile, relaylogCoordinates.LogPos) + binlogEvent = &BinlogEvent{ + Coordinates: *relaylogCoordinates, + } + err = sqlutils.QueryRowsMapBuffered(db, query, func(m sqlutils.RowMap) error { + return readBinlogEvent(binlogEvent, m) + }) + return binlogEvent, err +} + +// Try and find the last position of a pseudo GTID query entry in the given binary log. +// Also return the full text of that entry. +// maxCoordinates is the position beyond which we should not read. This is relevant when reading relay logs; in particular, +// the last relay log. We must be careful not to scan for Pseudo-GTID entries past the position executed by the SQL thread. +// maxCoordinates == nil means no limit. +func getLastExecutedEntryInRelaylog(instanceKey *InstanceKey, binlog string, minCoordinates *BinlogCoordinates, maxCoordinates *BinlogCoordinates) (binlogEvent *BinlogEvent, err error) { + if binlog == "" { + return nil, log.Errorf("getLastExecutedEntryInRelaylog: empty binlog file name for %+v. maxCoordinates = %+v", *instanceKey, maxCoordinates) + } + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return nil, err + } + binlogEvent = &BinlogEvent{ + Coordinates: BinlogCoordinates{LogFile: binlog, LogPos: 0, Type: RelayLog}, + } + + moreRowsExpected := true + var relyLogMinPos int64 = 0 + if minCoordinates != nil && minCoordinates.LogFile == binlog { + log.Debugf("getLastExecutedEntryInRelaylog: starting with %+v", *minCoordinates) + relyLogMinPos = minCoordinates.LogPos + } + + step := 0 + for moreRowsExpected { + query := fmt.Sprintf("show relaylog events in '%s' FROM %d LIMIT %d,%d", binlog, relyLogMinPos, (step * config.Config.BinlogEventsChunkSize), config.Config.BinlogEventsChunkSize) + + moreRowsExpected = false + err = sqlutils.QueryRowsMapBuffered(db, query, func(m sqlutils.RowMap) error { + moreRowsExpected = true + return readBinlogEvent(binlogEvent, m) + }) + if err != nil { + return nil, err + } + step++ + } + + // Not found? return nil. an error is reserved to SQL problems. 
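+	// (Editorial note: a LogPos of 0 serves as the "nothing found" sentinel here; in practice the
+	// first event of a MySQL binary/relay log starts after the 4-byte magic header, so no real event
+	// ever sits at position 0.)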
+ if binlogEvent.Coordinates.LogPos == 0 { + return nil, nil + } + return binlogEvent, err +} + +func GetLastExecutedEntryInRelayLogs(instance *Instance, minBinlogCoordinates *BinlogCoordinates, recordedInstanceRelayLogCoordinates BinlogCoordinates) (binlogEvent *BinlogEvent, err error) { + // Look for last GTID in relay logs: + // Since MySQL does not provide with a SHOW RELAY LOGS command, we heuristically start from current + // relay log (indiciated by Relay_log_file) and walk backwards. + + currentRelayLog := recordedInstanceRelayLogCoordinates + for err == nil { + log.Debugf("Searching for latest entry in relaylog %+v of %+v, up to pos %+v", currentRelayLog.LogFile, instance.Key, recordedInstanceRelayLogCoordinates) + if binlogEvent, err = getLastExecutedEntryInRelaylog(&instance.Key, currentRelayLog.LogFile, minBinlogCoordinates, &recordedInstanceRelayLogCoordinates); err != nil { + return nil, err + } else if binlogEvent != nil { + log.Debugf("Found entry in %+v, %+v", instance.Key, binlogEvent.Coordinates) + return binlogEvent, err + } + if minBinlogCoordinates != nil && minBinlogCoordinates.LogFile == currentRelayLog.LogFile { + // We tried and failed with the minBinlogCoordinates hint. We no longer require it, + // and continue with exhaustive search. + minBinlogCoordinates = nil + log.Debugf("Heuristic relaylog search failed; continuing exhaustive search") + // And we do NOT iterate to previous log file: we scan same log faile again, with no heuristic + } else { + currentRelayLog, err = currentRelayLog.PreviousFileCoordinates() + } + } + return binlogEvent, err +} + +// SearchBinlogEntryInRelaylog +func searchEventInRelaylog(instanceKey *InstanceKey, binlog string, searchEvent *BinlogEvent, minCoordinates *BinlogCoordinates) (binlogCoordinates, nextCoordinates *BinlogCoordinates, found bool, err error) { + binlogCoordinates = &BinlogCoordinates{LogFile: binlog, LogPos: 0, Type: RelayLog} + nextCoordinates = &BinlogCoordinates{LogFile: binlog, LogPos: 0, Type: RelayLog} + if binlog == "" { + return binlogCoordinates, nextCoordinates, false, log.Errorf("SearchEventInRelaylog: empty relaylog file name for %+v", *instanceKey) + } + + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return binlogCoordinates, nextCoordinates, false, err + } + + moreRowsExpected := true + var relyLogMinPos int64 = 0 + if minCoordinates != nil && minCoordinates.LogFile == binlog { + log.Debugf("SearchEventInRelaylog: starting with %+v", *minCoordinates) + relyLogMinPos = minCoordinates.LogPos + } + binlogEvent := &BinlogEvent{ + Coordinates: BinlogCoordinates{LogFile: binlog, LogPos: 0, Type: RelayLog}, + } + + skipRestOfBinlog := false + + step := 0 + for moreRowsExpected { + query := fmt.Sprintf("show relaylog events in '%s' FROM %d LIMIT %d,%d", binlog, relyLogMinPos, (step * config.Config.BinlogEventsChunkSize), config.Config.BinlogEventsChunkSize) + + // We don't know in advance when we will hit the end of the binlog. We will implicitly understand it when our + // `show binlog events` query does not return any row. + moreRowsExpected = false + err = sqlutils.QueryRowsMapBuffered(db, query, func(m sqlutils.RowMap) error { + if binlogCoordinates.LogPos != 0 && nextCoordinates.LogPos != 0 { + // Entry found! + skipRestOfBinlog = true + return nil + } + if skipRestOfBinlog { + return nil + } + moreRowsExpected = true + + if binlogCoordinates.LogPos == 0 { + readBinlogEvent(binlogEvent, m) + if binlogEvent.EqualsIgnoreCoordinates(searchEvent) { + // found it! 
+ binlogCoordinates.LogPos = m.GetInt64("Pos") + } + } else if nextCoordinates.LogPos == 0 { + // found binlogCoordinates: the next coordinates are nextCoordinates :P + nextCoordinates.LogPos = m.GetInt64("Pos") + } + return nil + }) + if err != nil { + return binlogCoordinates, nextCoordinates, (binlogCoordinates.LogPos != 0), err + } + if skipRestOfBinlog { + return binlogCoordinates, nextCoordinates, (binlogCoordinates.LogPos != 0), err + } + step++ + } + return binlogCoordinates, nextCoordinates, (binlogCoordinates.LogPos != 0), err +} + +func SearchEventInRelayLogs(searchEvent *BinlogEvent, instance *Instance, minBinlogCoordinates *BinlogCoordinates, recordedInstanceRelayLogCoordinates BinlogCoordinates) (binlogCoordinates, nextCoordinates *BinlogCoordinates, found bool, err error) { + // Since MySQL does not provide with a SHOW RELAY LOGS command, we heuristically start from current + // relay log (indiciated by Relay_log_file) and walk backwards. + log.Debugf("will search for event %+v", *searchEvent) + if minBinlogCoordinates != nil { + log.Debugf("Starting with coordinates: %+v", *minBinlogCoordinates) + } + currentRelayLog := recordedInstanceRelayLogCoordinates + for err == nil { + log.Debugf("Searching for event in relaylog %+v of %+v, up to pos %+v", currentRelayLog.LogFile, instance.Key, recordedInstanceRelayLogCoordinates) + if binlogCoordinates, nextCoordinates, found, err = searchEventInRelaylog(&instance.Key, currentRelayLog.LogFile, searchEvent, minBinlogCoordinates); err != nil { + return nil, nil, false, err + } else if binlogCoordinates != nil && found { + log.Debugf("Found event in %+v, %+v", instance.Key, *binlogCoordinates) + return binlogCoordinates, nextCoordinates, found, err + } + if minBinlogCoordinates != nil && minBinlogCoordinates.LogFile == currentRelayLog.LogFile { + // We tried and failed with the minBinlogCoordinates hint. We no longer require it, + // and continue with exhaustive search. + minBinlogCoordinates = nil + log.Debugf("Heuristic relaylog search failed; continuing exhaustive search") + // And we do NOT iterate to previous log file: we scan same log faile again, with no heuristic + } else { + currentRelayLog, err = currentRelayLog.PreviousFileCoordinates() + } + } + return binlogCoordinates, nextCoordinates, found, err +} + +// SearchEntryInBinlog Given a binlog entry text (query), search it in the given binary log of a given instance +func SearchEntryInBinlog(pseudoGTIDRegexp *regexp.Regexp, instanceKey *InstanceKey, binlog string, entryText string, monotonicPseudoGTIDEntries bool, minBinlogCoordinates *BinlogCoordinates) (BinlogCoordinates, bool, error) { + binlogCoordinates := BinlogCoordinates{LogFile: binlog, LogPos: 0, Type: BinaryLog} + if binlog == "" { + return binlogCoordinates, false, log.Errorf("SearchEntryInBinlog: empty binlog file name for %+v", *instanceKey) + } + + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return binlogCoordinates, false, err + } + + moreRowsExpected := true + skipRestOfBinlog := false + alreadyMatchedAscendingPseudoGTID := false + var nextPos int64 = 0 + if minBinlogCoordinates != nil && minBinlogCoordinates.LogFile == binlog { + log.Debugf("SearchEntryInBinlog: starting with %+v", *minBinlogCoordinates) + nextPos = minBinlogCoordinates.LogPos + } + + for moreRowsExpected { + query := fmt.Sprintf("show binlog events in '%s' FROM %d LIMIT %d", binlog, nextPos, config.Config.BinlogEventsChunkSize) + + // We don't know in advance when we will hit the end of the binlog. 
We will implicitly understand it when our + `show binlog events` query does not return any row. + moreRowsExpected = false + + err = sqlutils.QueryRowsMapBuffered(db, query, func(m sqlutils.RowMap) error { + if binlogCoordinates.LogPos != 0 { + // Entry found! + skipRestOfBinlog = true + return nil + } + if skipRestOfBinlog { + return nil + } + moreRowsExpected = true + nextPos = m.GetInt64("End_log_pos") + binlogEntryInfo := m.GetString("Info") + // + if binlogEntryInfo == entryText { + // found it! + binlogCoordinates.LogPos = m.GetInt64("Pos") + } else if monotonicPseudoGTIDEntries && !alreadyMatchedAscendingPseudoGTID { + // This part assumes we're searching for Pseudo-GTID. Typically that is the case; however, this function can + // also be used for generic searches through the binary log. + // More heavyweight computation here. Need to verify whether the binlog entry we have is a pseudo-gtid entry. + // We only want to check for ASCENDING once, at the top of the binary log. + // If we find the first entry to be higher than the searched one, clearly we are done. + // If not, then by virtue of binary logs, we still have to full-scan the entire binlog sequentially; we + // do not check again for ASCENDING (no point), so we save the CPU that would otherwise be wasted on regexp matching. + if pseudoGTIDMatches(pseudoGTIDRegexp, binlogEntryInfo) { + alreadyMatchedAscendingPseudoGTID = true + log.Debugf("Matched ascending Pseudo-GTID entry in %+v", binlog) + if binlogEntryInfo > entryText { + // Entries ascending, and current entry is larger than the one we are searching for. + // There is no need to scan further on. We can skip the entire binlog + log.Debugf(`Pseudo GTID entries are monotonic and we hit "%+v" > "%+v"; skipping binlog %+v`, m.GetString("Info"), entryText, binlogCoordinates.LogFile) + skipRestOfBinlog = true + return nil + } + } + } + return nil + }) + if err != nil { + return binlogCoordinates, (binlogCoordinates.LogPos != 0), err + } + if skipRestOfBinlog { + return binlogCoordinates, (binlogCoordinates.LogPos != 0), err + } + } + + return binlogCoordinates, (binlogCoordinates.LogPos != 0), err +} + +// SearchEntryInInstanceBinlogs will search for a specific text entry within the binary logs of a given instance. +func SearchEntryInInstanceBinlogs(instance *Instance, entryText string, monotonicPseudoGTIDEntries bool, minBinlogCoordinates *BinlogCoordinates) (*BinlogCoordinates, error) { + pseudoGTIDRegexp, err := compilePseudoGTIDPattern() + if err != nil { + return nil, err + } + cacheKey := getInstanceBinlogEntryKey(instance, entryText) + coords, found := instanceBinlogEntryCache.Get(cacheKey) + if found { + // This is wonderful. We can skip the tedious GTID search in the binary log + log.Debugf("Found instance Pseudo GTID entry coordinates in cache: %+v, %+v, %+v", instance.Key, entryText, coords) + return coords.(*BinlogCoordinates), nil + } + + // Look for GTID entry in given instance: + log.Debugf("Searching for given pseudo gtid entry in %+v. monotonicPseudoGTIDEntries=%+v", instance.Key, monotonicPseudoGTIDEntries) + currentBinlog := instance.SelfBinlogCoordinates + err = nil + for { + log.Debugf("Searching for given pseudo gtid entry in binlog %+v of %+v", currentBinlog.LogFile, instance.Key) + // One loop iteration per binary log. This might turn out to be a heavyweight operation. We wish to throttle the operation such that + // the instance does not suffer. If it is a replica, we will only act as long as it's not lagging too much.
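+ // The throttle below re-reads the instance and sleeps ReasonableMaintenanceReplicationLagSeconds between checks, resuming the scan only once HasReasonableMaintenanceReplicationLag() is satisfied.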
+ if instance.ReplicaRunning() { + for { + log.Debugf("%+v is a replicating replica. Verifying lag", instance.Key) + instance, err = ReadTopologyInstance(&instance.Key) + if err != nil { + break + } + if instance.HasReasonableMaintenanceReplicationLag() { + // is good to go! + break + } + log.Debugf("lag is too high on %+v. Throttling the search for pseudo gtid entry", instance.Key) + time.Sleep(time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds) * time.Second) + } + } + var resultCoordinates BinlogCoordinates + var found bool = false + resultCoordinates, found, err = SearchEntryInBinlog(pseudoGTIDRegexp, &instance.Key, currentBinlog.LogFile, entryText, monotonicPseudoGTIDEntries, minBinlogCoordinates) + if err != nil { + break + } + if found { + log.Debugf("Matched entry in %+v: %+v", instance.Key, resultCoordinates) + instanceBinlogEntryCache.Set(cacheKey, &resultCoordinates, 0) + return &resultCoordinates, nil + } + // Got here? Unfound. Keep looking + if minBinlogCoordinates != nil && minBinlogCoordinates.LogFile == currentBinlog.LogFile { + log.Debugf("Heuristic master binary logs search failed; continuing exhaustive search") + minBinlogCoordinates = nil + } else { + currentBinlog, err = currentBinlog.PreviousFileCoordinates() + if err != nil { + break + } + log.Debugf("- Will move next to binlog %+v", currentBinlog.LogFile) + } + } + + return nil, log.Errorf("Cannot match pseudo GTID entry in binlogs of %+v; err: %+v", instance.Key, err) +} + +// Read (as much as possible of) a chunk of binary log events starting the given startingCoordinates +func readBinlogEventsChunk(instanceKey *InstanceKey, startingCoordinates BinlogCoordinates) ([]BinlogEvent, error) { + events := []BinlogEvent{} + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return events, err + } + commandToken := math.TernaryString(startingCoordinates.Type == BinaryLog, "binlog", "relaylog") + if startingCoordinates.LogFile == "" { + return events, log.Errorf("readBinlogEventsChunk: empty binlog file name for %+v.", *instanceKey) + } + query := fmt.Sprintf("show %s events in '%s' FROM %d LIMIT %d", commandToken, startingCoordinates.LogFile, startingCoordinates.LogPos, config.Config.BinlogEventsChunkSize) + err = sqlutils.QueryRowsMap(db, query, func(m sqlutils.RowMap) error { + binlogEvent := BinlogEvent{} + binlogEvent.Coordinates.LogFile = m.GetString("Log_name") + binlogEvent.Coordinates.LogPos = m.GetInt64("Pos") + binlogEvent.Coordinates.Type = startingCoordinates.Type + binlogEvent.NextEventPos = m.GetInt64("End_log_pos") + binlogEvent.EventType = m.GetString("Event_type") + binlogEvent.Info = m.GetString("Info") + + events = append(events, binlogEvent) + return nil + }) + return events, err +} + +// Return the next chunk of binlog events; skip to next binary log file if need be; return empty result only +// if reached end of binary logs +func getNextBinlogEventsChunk(instance *Instance, startingCoordinates BinlogCoordinates, numEmptyBinlogs int) ([]BinlogEvent, error) { + if numEmptyBinlogs > maxEmptyBinlogFiles { + log.Debugf("Reached maxEmptyBinlogFiles (%d) at %+v", maxEmptyBinlogFiles, startingCoordinates) + // Give up and return empty results + return []BinlogEvent{}, nil + } + coordinatesExceededCurrent := false + switch startingCoordinates.Type { + case BinaryLog: + coordinatesExceededCurrent = instance.SelfBinlogCoordinates.FileSmallerThan(&startingCoordinates) + case RelayLog: + coordinatesExceededCurrent = 
instance.RelaylogCoordinates.FileSmallerThan(&startingCoordinates) + } + if coordinatesExceededCurrent { + // We're past the last file. This is a non-error: there are no more events. + log.Debugf("Coordinates overflow: %+v; terminating search", startingCoordinates) + return []BinlogEvent{}, nil + } + events, err := readBinlogEventsChunk(&instance.Key, startingCoordinates) + if err != nil { + return events, err + } + if len(events) > 0 { + log.Debugf("Returning %d events at %+v", len(events), startingCoordinates) + return events, nil + } + + // events are empty + if nextCoordinates, err := instance.GetNextBinaryLog(startingCoordinates); err == nil { + log.Debugf("Recursing into %+v", nextCoordinates) + return getNextBinlogEventsChunk(instance, nextCoordinates, numEmptyBinlogs+1) + } + // on error + return events, err +} + +// used by GetNextBinlogCoordinatesToMatch to format debug information appropriately +// format the event information in debug output +func formatEventCleanly(event BinlogEvent, length *int) string { + return fmt.Sprintf("%+v %+v; %+v", rpad(event.Coordinates, length), event.EventType, strings.Split(strings.TrimSpace(event.Info), "\n")[0]) +} + +// Only do special filtering if instance is MySQL-5.7 and other +// is MySQL-5.6 and in pseudo-gtid mode. +// returns applyInstanceSpecialFiltering, applyOtherSpecialFiltering, err +func special56To57filterProcessing(instance *Instance, other *Instance) (bool, bool, error) { + // be paranoid + if instance == nil || other == nil { + return false, false, fmt.Errorf("special56To57filterProcessing: instance or other is nil. Should not happen") + } + + filterInstance := instance.FlavorNameAndMajorVersion() == "MySQL-5.7" && // 5.7 replica + other.FlavorNameAndMajorVersion() == "MySQL-5.6" // replicating under 5.6 master + + // The logic for other is a bit weird and may require us + // to check the instance's master. To avoid this do some + // preliminary checks first to avoid the "master" access + // unless absolutely needed. + if instance.LogBinEnabled || // instance writes binlogs (not relay logs) + instance.FlavorNameAndMajorVersion() != "MySQL-5.7" || // instance NOT 5.7 replica + other.FlavorNameAndMajorVersion() != "MySQL-5.7" { // new master is NOT 5.7 + return filterInstance, false /* good exit status avoiding checking master */, nil + } + + // We need to check if the master is 5.6 + // - Do not call GetInstanceMaster() as that requires the + // master to be available, and this code may be called + // during a master/intermediate master failover when the + // master may not actually be reachable. + master, _, err := ReadInstance(&instance.MasterKey) + if err != nil { + return false, false, log.Errorf("special56To57filterProcessing: ReadInstance(%+v) fails: %+v", instance.MasterKey, err) + } + + filterOther := master.FlavorNameAndMajorVersion() == "MySQL-5.6" // master(instance) == 5.6 + + return filterInstance, filterOther, nil +} + +// The event type to filter out +const anonymousGTIDNextEvent = "SET @@SESSION.GTID_NEXT= 'ANONYMOUS'" + +// check if the event is one we want to skip. +func specialEventToSkip(event *BinlogEvent) bool { + if event != nil && strings.Index(event.Info, anonymousGTIDNextEvent) >= 0 { + return true + } + return false +} + +// GetNextBinlogCoordinatesToMatch is given a twin-coordinates couple for a would-be replica (instance) and another +// instance (other). 
+// This is part of the match-below process, and is the heart of the operation: matching the binlog events starting +// the twin-coordinates (where both share the same Pseudo-GTID) until "instance" runs out of entries, hopefully +// before "other" runs out. +// If "other" runs out that means "instance" is more advanced in replication than "other", in which case we can't +// turn it into a replica of "other". +func GetNextBinlogCoordinatesToMatch( + instance *Instance, + instanceCoordinates BinlogCoordinates, + recordedInstanceRelayLogCoordinates BinlogCoordinates, + maxBinlogCoordinates *BinlogCoordinates, + other *Instance, + otherCoordinates BinlogCoordinates) (*BinlogCoordinates, int, error) { + + const noMatchedEvents int = 0 // to make return statements' intent clearer + + // create instanceCursor for scanning instance binlog events + fetchNextEvents := func(binlogCoordinates BinlogCoordinates) ([]BinlogEvent, error) { + return getNextBinlogEventsChunk(instance, binlogCoordinates, 0) + } + instanceCursor := NewBinlogEventCursor(instanceCoordinates, fetchNextEvents) + + // create otherCursor for scanning other binlog events + fetchOtherNextEvents := func(binlogCoordinates BinlogCoordinates) ([]BinlogEvent, error) { + return getNextBinlogEventsChunk(other, binlogCoordinates, 0) + } + otherCursor := NewBinlogEventCursor(otherCoordinates, fetchOtherNextEvents) + + // for 5.6 to 5.7 replication special processing may be needed. + applyInstanceSpecialFiltering, applyOtherSpecialFiltering, err := special56To57filterProcessing(instance, other) + if err != nil { + return nil, noMatchedEvents, log.Errore(err) + } + + var ( + beautifyCoordinatesLength int = 0 + countMatchedEvents int = 0 + lastConsumedEventCoordinates BinlogCoordinates + ) + + for { + // Exhaust binlogs/relaylogs on instance. While iterating them, also iterate the otherInstance binlogs. + // We expect entries on both to match, sequentially, until instance's binlogs/relaylogs are exhausted. + var ( + // the whole event to make things simpler + instanceEvent BinlogEvent + otherEvent BinlogEvent + ) + + { + // we may need to skip Anonymous GTID Next Events so loop here over any we find + var event *BinlogEvent + var err error + for done := false; !done; { + // Extract next binlog/relaylog entry from instance: + event, err = instanceCursor.nextRealEvent(0) + if err != nil { + return nil, noMatchedEvents, log.Errore(err) + } + if event != nil { + lastConsumedEventCoordinates = event.Coordinates + } + if event == nil || !applyInstanceSpecialFiltering || !specialEventToSkip(event) { + done = true + } + } + + switch instanceCoordinates.Type { + case BinaryLog: + if event == nil { + // end of binary logs for instance: + otherNextCoordinates, err := otherCursor.getNextCoordinates() + if err != nil { + return nil, noMatchedEvents, log.Errore(err) + } + instanceNextCoordinates, err := instanceCursor.getNextCoordinates() + if err != nil { + return nil, noMatchedEvents, log.Errore(err) + } + // sanity check + if instanceNextCoordinates.SmallerThan(&instance.SelfBinlogCoordinates) { + return nil, noMatchedEvents, log.Errorf("Unexpected problem: instance binlog iteration ended before self coordinates. Ended with: %+v, self coordinates: %+v", instanceNextCoordinates, instance.SelfBinlogCoordinates) + } + // Possible good exit point. + log.Debugf("Reached end of binary logs for instance, at %+v. 
Other coordinates: %+v", instanceNextCoordinates, otherNextCoordinates) + return &otherNextCoordinates, countMatchedEvents, nil + } + case RelayLog: + // Argghhhh! SHOW RELAY LOG EVENTS IN '...' statement returns CRAPPY values for End_log_pos: + // instead of returning the end log pos of the current statement in the *relay log*, it shows + // the end log pos of the matching statement in the *master's binary log*! + // Yes, there's logic to this. But this means the next-ccordinates are meaningless. + // As result, in the case where we exhaust (following) the relay log, we cannot do our last + // nice sanity test that we've indeed reached the Relay_log_pos coordinate; we are only at the + // last statement, which is SMALLER than Relay_log_pos; and there isn't a "Rotate" entry to make + // a place holder or anything. The log just ends and we can't be absolutely certain that the next + // statement is indeed (futuristically) as End_log_pos. + endOfScan := false + if event == nil { + // End of relay log... + endOfScan = true + log.Debugf("Reached end of relay log at %+v", recordedInstanceRelayLogCoordinates) + } else if recordedInstanceRelayLogCoordinates.Equals(&event.Coordinates) { + // We've passed the maxScanInstanceCoordinates (applies for relay logs) + endOfScan = true + log.Debugf("Reached replica relay log coordinates at %+v", recordedInstanceRelayLogCoordinates) + } else if recordedInstanceRelayLogCoordinates.SmallerThan(&event.Coordinates) { + return nil, noMatchedEvents, log.Errorf("Unexpected problem: relay log scan passed relay log position without hitting it. Ended with: %+v, relay log position: %+v", event.Coordinates, recordedInstanceRelayLogCoordinates) + } + if endOfScan { + // end of binary logs for instance: + otherNextCoordinates, err := otherCursor.getNextCoordinates() + if err != nil { + log.Debugf("otherCursor.getNextCoordinates() failed. otherCoordinates=%+v, cached events in cursor: %d; index=%d", otherCoordinates, len(otherCursor.cachedEvents), otherCursor.currentEventIndex) + return nil, noMatchedEvents, log.Errore(err) + } + // Possible good exit point. + // No further sanity checks (read the above lengthy explanation) + log.Debugf("Reached limit of relay logs for instance, just after %+v. Other coordinates: %+v", lastConsumedEventCoordinates, otherNextCoordinates) + return &otherNextCoordinates, countMatchedEvents, nil + } + } + + instanceEvent = *event // make a physical copy + log.Debugf("> %s", formatEventCleanly(instanceEvent, &beautifyCoordinatesLength)) + } + { + // Extract next binlog/relaylog entry from other (intended master): + // - this must have binlogs. We may need to filter anonymous events if we were processing + // a relay log on instance and the instance's master runs 5.6 + var event *BinlogEvent + var err error + for done := false; !done; { + // Extract next binlog entry from other: + event, err = otherCursor.nextRealEvent(0) + if err != nil { + return nil, noMatchedEvents, log.Errore(err) + } + if event == nil || !applyOtherSpecialFiltering || !specialEventToSkip(event) { + done = true + } + } + + if event == nil { + // end of binary logs for otherInstance: this is unexpected and means instance is more advanced + // than otherInstance + return nil, noMatchedEvents, log.Errorf("Unexpected end of binary logs for assumed master (%+v). This means the instance which attempted to be a replica (%+v) was more advanced. 
Try the other way round", other.Key, instance.Key) + } + + otherEvent = *event // make a physical copy + log.Debugf("< %s", formatEventCleanly(otherEvent, &beautifyCoordinatesLength)) + } + // Verify things are sane (the two extracted entries are identical): + // (not strictly required by the algorithm but adds such a lovely self-sanity-testing essence) + if instanceEvent.Info != otherEvent.Info { + return nil, noMatchedEvents, log.Errorf("Mismatching entries, aborting: %+v <-> %+v", instanceEvent.Info, otherEvent.Info) + } + countMatchedEvents++ + if maxBinlogCoordinates != nil { + // Possible good exit point. + // Not searching till end of binary logs/relay log exec pos. Instead, we're stopping at an instructed position. + if instanceEvent.Coordinates.Equals(maxBinlogCoordinates) { + log.Debugf("maxBinlogCoordinates specified as %+v and reached. Stopping", *maxBinlogCoordinates) + return &otherEvent.Coordinates, countMatchedEvents, nil + } else if maxBinlogCoordinates.SmallerThan(&instanceEvent.Coordinates) { + return nil, noMatchedEvents, log.Errorf("maxBinlogCoordinates (%+v) exceeded but not met", *maxBinlogCoordinates) + } + } + } + // Won't get here +} + +func GetPreviousGTIDs(instanceKey *InstanceKey, binlog string) (previousGTIDs *OracleGtidSet, err error) { + if binlog == "" { + return nil, log.Errorf("GetPreviousGTIDs: empty binlog file name for %+v", *instanceKey) + } + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return nil, err + } + + query := fmt.Sprintf("show binlog events in '%s' LIMIT 5", binlog) + + err = sqlutils.QueryRowsMapBuffered(db, query, func(m sqlutils.RowMap) error { + eventType := m.GetString("Event_type") + if eventType == "Previous_gtids" { + var e error + if previousGTIDs, e = NewOracleGtidSet(m.GetString("Info")); e != nil { + return e + } + } + return nil + }) + return previousGTIDs, err +} diff --git a/go/vt/orchestrator/inst/instance_dao.go b/go/vt/orchestrator/inst/instance_dao.go new file mode 100644 index 0000000000..5e0f2ce266 --- /dev/null +++ b/go/vt/orchestrator/inst/instance_dao.go @@ -0,0 +1,3314 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "bytes" + "database/sql" + "errors" + "fmt" + "regexp" + "runtime" + "sort" + "strconv" + "strings" + "sync" + "time" + + "github.com/go-sql-driver/mysql" + + "github.com/patrickmn/go-cache" + "github.com/rcrowley/go-metrics" + "github.com/sjmudd/stopwatch" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/math" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + + "vitess.io/vitess/go/vt/orchestrator/attributes" + "vitess.io/vitess/go/vt/orchestrator/collection" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/kv" + "vitess.io/vitess/go/vt/orchestrator/metrics/query" + "vitess.io/vitess/go/vt/orchestrator/util" +) + +const ( + backendDBConcurrency = 20 + retryInstanceFunctionCount = 5 + retryInterval = 500 * time.Millisecond + error1045AccessDenied = "Error 1045: Access denied for user" + errorConnectionRefused = "getsockopt: connection refused" + errorNoSuchHost = "no such host" + errorIOTimeout = "i/o timeout" +) + +var instanceReadChan = make(chan bool, backendDBConcurrency) +var instanceWriteChan = make(chan bool, backendDBConcurrency) + +// InstancesByCountReplicas is a sortable type for Instance +type InstancesByCountReplicas [](*Instance) + +func (this InstancesByCountReplicas) Len() int { return len(this) } +func (this InstancesByCountReplicas) Swap(i, j int) { this[i], this[j] = this[j], this[i] } +func (this InstancesByCountReplicas) Less(i, j int) bool { + return len(this[i].Replicas) < len(this[j].Replicas) +} + +// Constant strings for Group Replication information +// See https://dev.mysql.com/doc/refman/8.0/en/replication-group-members-table.html for additional information. +const ( + // Group member roles + GroupReplicationMemberRolePrimary = "PRIMARY" + GroupReplicationMemberRoleSecondary = "SECONDARY" + // Group member states + GroupReplicationMemberStateOnline = "ONLINE" + GroupReplicationMemberStateRecovering = "RECOVERING" + GroupReplicationMemberStateOffline = "OFFLINE" + GroupReplicationMemberStateError = "ERROR" +) + +// We use this map to identify whether the query failed because the server does not support group replication or due +// to a different reason. +var GroupReplicationNotSupportedErrors = map[uint16]bool{ + // If either the group replication global variables are not known or the + // performance_schema.replication_group_members table does not exist, the host does not support group + // replication, at least in the form supported here. + 1193: true, // ERROR: 1193 (HY000): Unknown system variable 'group_replication_group_name' + 1146: true, // ERROR: 1146 (42S02): Table 'performance_schema.replication_group_members' doesn't exist +} + +// instanceKeyInformativeClusterName is a non-authoritative cache; used for auditing or general purpose. 
+var instanceKeyInformativeClusterName *cache.Cache +var forgetInstanceKeys *cache.Cache +var clusterInjectedPseudoGTIDCache *cache.Cache + +var accessDeniedCounter = metrics.NewCounter() +var readTopologyInstanceCounter = metrics.NewCounter() +var readInstanceCounter = metrics.NewCounter() +var writeInstanceCounter = metrics.NewCounter() +var backendWrites = collection.CreateOrReturnCollection("BACKEND_WRITES") +var writeBufferMetrics = collection.CreateOrReturnCollection("WRITE_BUFFER") +var writeBufferLatency = stopwatch.NewNamedStopwatch() + +var emptyQuotesRegexp = regexp.MustCompile(`^""$`) + +func init() { + metrics.Register("instance.access_denied", accessDeniedCounter) + metrics.Register("instance.read_topology", readTopologyInstanceCounter) + metrics.Register("instance.read", readInstanceCounter) + metrics.Register("instance.write", writeInstanceCounter) + writeBufferLatency.AddMany([]string{"wait", "write"}) + writeBufferLatency.Start("wait") + + go initializeInstanceDao() +} + +func initializeInstanceDao() { + config.WaitForConfigurationToBeLoaded() + instanceWriteBuffer = make(chan instanceUpdateObject, config.Config.InstanceWriteBufferSize) + instanceKeyInformativeClusterName = cache.New(time.Duration(config.Config.InstancePollSeconds/2)*time.Second, time.Second) + forgetInstanceKeys = cache.New(time.Duration(config.Config.InstancePollSeconds*3)*time.Second, time.Second) + clusterInjectedPseudoGTIDCache = cache.New(time.Minute, time.Second) + // spin off instance write buffer flushing + go func() { + flushTick := time.Tick(time.Duration(config.Config.InstanceFlushIntervalMilliseconds) * time.Millisecond) + for { + // it is time to flush + select { + case <-flushTick: + flushInstanceWriteBuffer() + case <-forceFlushInstanceWriteBuffer: + flushInstanceWriteBuffer() + } + } + }() +} + +// ExecDBWriteFunc chooses how to execute a write onto the database: whether synchronuously or not +func ExecDBWriteFunc(f func() error) error { + m := query.NewMetric() + + instanceWriteChan <- true + m.WaitLatency = time.Since(m.Timestamp) + + // catch the exec time and error if there is one + defer func() { + if r := recover(); r != nil { + if _, ok := r.(runtime.Error); ok { + panic(r) + } + + if s, ok := r.(string); ok { + m.Err = errors.New(s) + } else { + m.Err = r.(error) + } + } + m.ExecuteLatency = time.Since(m.Timestamp.Add(m.WaitLatency)) + backendWrites.Append(m) + <-instanceWriteChan // assume this takes no time + }() + res := f() + return res +} + +func ExpireTableData(tableName string, timestampColumn string) error { + query := fmt.Sprintf("delete from %s where %s < NOW() - INTERVAL ? DAY", tableName, timestampColumn) + writeFunc := func() error { + _, err := db.ExecOrchestrator(query, config.Config.AuditPurgeDays) + return err + } + return ExecDBWriteFunc(writeFunc) +} + +// logReadTopologyInstanceError logs an error, if applicable, for a ReadTopologyInstance operation, +// providing context and hint as for the source of the error. If there's no hint just provide the +// original error. 
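+// The logged message takes the form "ReadTopologyInstance(<instance key>) <hint>: <error>"; when no hint is + // given, only the instance key and error are formatted. Any "%" characters in the hint are escaped before formatting.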
+func logReadTopologyInstanceError(instanceKey *InstanceKey, hint string, err error) error { + if err == nil { + return nil + } + if !util.ClearToLog("ReadTopologyInstance", instanceKey.StringCode()) { + return err + } + var msg string + if hint == "" { + msg = fmt.Sprintf("ReadTopologyInstance(%+v): %+v", *instanceKey, err) + } else { + msg = fmt.Sprintf("ReadTopologyInstance(%+v) %+v: %+v", + *instanceKey, + strings.Replace(hint, "%", "%%", -1), // escape % + err) + } + return log.Errorf(msg) +} + +// ReadTopologyInstance collects information on the state of a MySQL +// server and writes the result synchronously to the orchestrator +// backend. +func ReadTopologyInstance(instanceKey *InstanceKey) (*Instance, error) { + return ReadTopologyInstanceBufferable(instanceKey, false, nil) +} + +func RetryInstanceFunction(f func() (*Instance, error)) (instance *Instance, err error) { + for i := 0; i < retryInstanceFunctionCount; i++ { + if instance, err = f(); err == nil { + return instance, nil + } + } + return instance, err +} + +// Is this an error which means that we shouldn't try going more queries for this discovery attempt? +func unrecoverableError(err error) bool { + contains := []string{ + error1045AccessDenied, + errorConnectionRefused, + errorIOTimeout, + errorNoSuchHost, + } + for _, k := range contains { + if strings.Contains(err.Error(), k) { + return true + } + } + return false +} + +// Check if the instance is a MaxScale binlog server (a proxy not a real +// MySQL server) and also update the resolved hostname +func (instance *Instance) checkMaxScale(db *sql.DB, latency *stopwatch.NamedStopwatch) (isMaxScale bool, resolvedHostname string, err error) { + if config.Config.SkipMaxScaleCheck { + return isMaxScale, resolvedHostname, err + } + + latency.Start("instance") + err = sqlutils.QueryRowsMap(db, "show variables like 'maxscale%'", func(m sqlutils.RowMap) error { + if m.GetString("Variable_name") == "MAXSCALE_VERSION" { + originalVersion := m.GetString("Value") + if originalVersion == "" { + originalVersion = m.GetString("value") + } + if originalVersion == "" { + originalVersion = "0.0.0" + } + instance.Version = originalVersion + "-maxscale" + instance.ServerID = 0 + instance.ServerUUID = "" + instance.Uptime = 0 + instance.Binlog_format = "INHERIT" + instance.ReadOnly = true + instance.LogBinEnabled = true + instance.LogReplicationUpdatesEnabled = true + resolvedHostname = instance.Key.Hostname + latency.Start("backend") + UpdateResolvedHostname(resolvedHostname, resolvedHostname) + latency.Stop("backend") + isMaxScale = true + } + return nil + }) + latency.Stop("instance") + + // Detect failed connection attempts and don't report the command + // we are executing as that might be confusing. + if err != nil { + if strings.Contains(err.Error(), error1045AccessDenied) { + accessDeniedCounter.Inc(1) + } + if unrecoverableError(err) { + logReadTopologyInstanceError(&instance.Key, "", err) + } else { + logReadTopologyInstanceError(&instance.Key, "show variables like 'maxscale%'", err) + } + } + + return isMaxScale, resolvedHostname, err +} + +// expectReplicationThreadsState expects both replication threads to be running, or both to be not running. +// Specifically, it looks for both to be "Yes" or for both to be "No". 
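+// For example, a caller waiting for replication to stop is satisfied only once SHOW SLAVE STATUS reports + // Slave_IO_Running=No and Slave_SQL_Running=No; a single stopped thread does not meet the expectation.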
+func expectReplicationThreadsState(instanceKey *InstanceKey, expectedState ReplicationThreadState) (expectationMet bool, err error) { + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return false, err + } + err = sqlutils.QueryRowsMap(db, "show slave status", func(m sqlutils.RowMap) error { + ioThreadState := ReplicationThreadStateFromStatus(m.GetString("Slave_IO_Running")) + sqlThreadState := ReplicationThreadStateFromStatus(m.GetString("Slave_SQL_Running")) + + if ioThreadState == expectedState && sqlThreadState == expectedState { + expectationMet = true + } + return nil + }) + return expectationMet, err +} + +// ReadTopologyInstanceBufferable connects to a topology MySQL instance +// and collects information on the server and its replication state. +// It writes the information retrieved into orchestrator's backend. +// - writes are optionally buffered. +// - timing information can be collected for the stages performed. +func ReadTopologyInstanceBufferable(instanceKey *InstanceKey, bufferWrites bool, latency *stopwatch.NamedStopwatch) (inst *Instance, err error) { + defer func() { + if r := recover(); r != nil { + err = logReadTopologyInstanceError(instanceKey, "Unexpected, aborting", fmt.Errorf("%+v", r)) + } + }() + + var waitGroup sync.WaitGroup + var serverUuidWaitGroup sync.WaitGroup + readingStartTime := time.Now() + instance := NewInstance() + instanceFound := false + partialSuccess := false + foundByShowSlaveHosts := false + resolvedHostname := "" + maxScaleMasterHostname := "" + isMaxScale := false + isMaxScale110 := false + slaveStatusFound := false + errorChan := make(chan error, 32) + var resolveErr error + + if !instanceKey.IsValid() { + latency.Start("backend") + if err := UpdateInstanceLastAttemptedCheck(instanceKey); err != nil { + log.Errorf("ReadTopologyInstanceBufferable: %+v: %v", instanceKey, err) + } + latency.Stop("backend") + return instance, fmt.Errorf("ReadTopologyInstance will not act on invalid instance key: %+v", *instanceKey) + } + + lastAttemptedCheckTimer := time.AfterFunc(time.Second, func() { + go UpdateInstanceLastAttemptedCheck(instanceKey) + }) + + latency.Start("instance") + db, err := db.OpenDiscovery(instanceKey.Hostname, instanceKey.Port) + latency.Stop("instance") + if err != nil { + goto Cleanup + } + + instance.Key = *instanceKey + + if isMaxScale, resolvedHostname, err = instance.checkMaxScale(db, latency); err != nil { + // We do not "goto Cleanup" here, although it should be the correct flow. + // Reason is 5.7's new security feature that requires GRANTs on performance_schema.session_variables. + // There is a wrong decision making in this design and the migration path to 5.7 will be difficult. + // I don't want orchestrator to put even more burden on this. + // If the statement errors, then we are unable to determine that this is maxscale, hence assume it is not. + // In which case there would be other queries sent to the server that are not affected by 5.7 behavior, and that will fail. + + // Certain errors are not recoverable (for this discovery process) so it's fine to go to Cleanup + if unrecoverableError(err) { + goto Cleanup + } + } + + latency.Start("instance") + if isMaxScale { + if strings.Contains(instance.Version, "1.1.0") { + isMaxScale110 = true + + // Buggy buggy maxscale 1.1.0. Reported Master_Host can be corrupted. 
+ // Therefore we (currently) take @@hostname (which is masquerading as master host anyhow) + err = db.QueryRow("select @@hostname").Scan(&maxScaleMasterHostname) + if err != nil { + goto Cleanup + } + } + if isMaxScale110 { + // Only this is supported: + db.QueryRow("select @@server_id").Scan(&instance.ServerID) + } else { + db.QueryRow("select @@global.server_id").Scan(&instance.ServerID) + db.QueryRow("select @@global.server_uuid").Scan(&instance.ServerUUID) + } + } else { + // NOT MaxScale + + // We begin with a few operations we can run concurrently, and which do not depend on anything + { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + var dummy string + // show global status works just as well with 5.6 & 5.7 (5.7 moves variables to performance_schema) + err := db.QueryRow("show global status like 'Uptime'").Scan(&dummy, &instance.Uptime) + + if err != nil { + logReadTopologyInstanceError(instanceKey, "show global status like 'Uptime'", err) + + // We do not "goto Cleanup" here, although it should be the correct flow. + // Reason is 5.7's new security feature that requires GRANTs on performance_schema.global_variables. + // There is a wrong decisionmaking in this design and the migration path to 5.7 will be difficult. + // I don't want orchestrator to put even more burden on this. The 'Uptime' variable is not that important + // so as to completely fail reading a 5.7 instance. + // This is supposed to be fixed in 5.7.9 + } + errorChan <- err + }() + } + + var mysqlHostname, mysqlReportHost string + err = db.QueryRow("select @@global.hostname, ifnull(@@global.report_host, ''), @@global.server_id, @@global.version, @@global.version_comment, @@global.read_only, @@global.binlog_format, @@global.log_bin, @@global.log_slave_updates").Scan( + &mysqlHostname, &mysqlReportHost, &instance.ServerID, &instance.Version, &instance.VersionComment, &instance.ReadOnly, &instance.Binlog_format, &instance.LogBinEnabled, &instance.LogReplicationUpdatesEnabled) + if err != nil { + goto Cleanup + } + partialSuccess = true // We at least managed to read something from the server. 
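+ // Pick which hostname to trust, per MySQLHostnameResolveMethod: "none" keeps the probed key's hostname, + // "default"/"hostname"/"@@hostname" use @@global.hostname, and "report_host"/"@@report_host" requires a + // non-empty @@global.report_host; unrecognized values fall back to the key's hostname.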
+ switch strings.ToLower(config.Config.MySQLHostnameResolveMethod) { + case "none": + resolvedHostname = instance.Key.Hostname + case "default", "hostname", "@@hostname": + resolvedHostname = mysqlHostname + case "report_host", "@@report_host": + if mysqlReportHost == "" { + err = fmt.Errorf("MySQLHostnameResolveMethod configured to use @@report_host but %+v has NULL/empty @@report_host", instanceKey) + goto Cleanup + } + resolvedHostname = mysqlReportHost + default: + resolvedHostname = instance.Key.Hostname + } + + if instance.LogBinEnabled { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + err := sqlutils.QueryRowsMap(db, "show master status", func(m sqlutils.RowMap) error { + var err error + instance.SelfBinlogCoordinates.LogFile = m.GetString("File") + instance.SelfBinlogCoordinates.LogPos = m.GetInt64("Position") + return err + }) + errorChan <- err + }() + } + + { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + semiSyncMasterPluginLoaded := false + semiSyncReplicaPluginLoaded := false + err := sqlutils.QueryRowsMap(db, "show global variables like 'rpl_semi_sync_%'", func(m sqlutils.RowMap) error { + if m.GetString("Variable_name") == "rpl_semi_sync_master_enabled" { + instance.SemiSyncMasterEnabled = (m.GetString("Value") == "ON") + semiSyncMasterPluginLoaded = true + } else if m.GetString("Variable_name") == "rpl_semi_sync_master_timeout" { + instance.SemiSyncMasterTimeout = m.GetUint64("Value") + } else if m.GetString("Variable_name") == "rpl_semi_sync_master_wait_for_slave_count" { + instance.SemiSyncMasterWaitForReplicaCount = m.GetUint("Value") + } else if m.GetString("Variable_name") == "rpl_semi_sync_slave_enabled" { + instance.SemiSyncReplicaEnabled = (m.GetString("Value") == "ON") + semiSyncReplicaPluginLoaded = true + } + return nil + }) + instance.SemiSyncAvailable = (semiSyncMasterPluginLoaded && semiSyncReplicaPluginLoaded) + errorChan <- err + }() + } + { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + err := sqlutils.QueryRowsMap(db, "show global status like 'rpl_semi_sync_%'", func(m sqlutils.RowMap) error { + if m.GetString("Variable_name") == "Rpl_semi_sync_master_status" { + instance.SemiSyncMasterStatus = (m.GetString("Value") == "ON") + } else if m.GetString("Variable_name") == "Rpl_semi_sync_master_clients" { + instance.SemiSyncMasterClients = m.GetUint("Value") + } else if m.GetString("Variable_name") == "Rpl_semi_sync_slave_status" { + instance.SemiSyncReplicaStatus = (m.GetString("Value") == "ON") + } + + return nil + }) + errorChan <- err + }() + } + if (instance.IsOracleMySQL() || instance.IsPercona()) && !instance.IsSmallerMajorVersionByString("5.6") { + waitGroup.Add(1) + serverUuidWaitGroup.Add(1) + go func() { + defer waitGroup.Done() + defer serverUuidWaitGroup.Done() + var masterInfoRepositoryOnTable bool + // Stuff only supported on Oracle MySQL >= 5.6 + // ... + // @@gtid_mode only available in Orcale MySQL >= 5.6 + // Previous version just issued this query brute-force, but I don't like errors being issued where they shouldn't. 
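+ // The scan error below is deliberately discarded: if any of these variables is missing, the query fails + // as a whole and the corresponding instance fields are simply left at their defaults.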
+ _ = db.QueryRow("select @@global.gtid_mode, @@global.server_uuid, @@global.gtid_executed, @@global.gtid_purged, @@global.master_info_repository = 'TABLE', @@global.binlog_row_image").Scan(&instance.GTIDMode, &instance.ServerUUID, &instance.ExecutedGtidSet, &instance.GtidPurged, &masterInfoRepositoryOnTable, &instance.BinlogRowImage) + if instance.GTIDMode != "" && instance.GTIDMode != "OFF" { + instance.SupportsOracleGTID = true + } + if config.Config.ReplicationCredentialsQuery != "" { + instance.ReplicationCredentialsAvailable = true + } else if masterInfoRepositoryOnTable { + _ = db.QueryRow("select count(*) > 0 and MAX(User_name) != '' from mysql.slave_master_info").Scan(&instance.ReplicationCredentialsAvailable) + } + }() + } + } + if resolvedHostname != instance.Key.Hostname { + latency.Start("backend") + UpdateResolvedHostname(instance.Key.Hostname, resolvedHostname) + latency.Stop("backend") + instance.Key.Hostname = resolvedHostname + } + if instance.Key.Hostname == "" { + err = fmt.Errorf("ReadTopologyInstance: empty hostname (%+v). Bailing out", *instanceKey) + goto Cleanup + } + go ResolveHostnameIPs(instance.Key.Hostname) + if config.Config.DataCenterPattern != "" { + if pattern, err := regexp.Compile(config.Config.DataCenterPattern); err == nil { + match := pattern.FindStringSubmatch(instance.Key.Hostname) + if len(match) != 0 { + instance.DataCenter = match[1] + } + } + // This can be overriden by later invocation of DetectDataCenterQuery + } + if config.Config.RegionPattern != "" { + if pattern, err := regexp.Compile(config.Config.RegionPattern); err == nil { + match := pattern.FindStringSubmatch(instance.Key.Hostname) + if len(match) != 0 { + instance.Region = match[1] + } + } + // This can be overriden by later invocation of DetectRegionQuery + } + if config.Config.PhysicalEnvironmentPattern != "" { + if pattern, err := regexp.Compile(config.Config.PhysicalEnvironmentPattern); err == nil { + match := pattern.FindStringSubmatch(instance.Key.Hostname) + if len(match) != 0 { + instance.PhysicalEnvironment = match[1] + } + } + // This can be overriden by later invocation of DetectPhysicalEnvironmentQuery + } + + instance.ReplicationIOThreadState = ReplicationThreadStateNoThread + instance.ReplicationSQLThreadState = ReplicationThreadStateNoThread + err = sqlutils.QueryRowsMap(db, "show slave status", func(m sqlutils.RowMap) error { + instance.HasReplicationCredentials = (m.GetString("Master_User") != "") + instance.ReplicationIOThreadState = ReplicationThreadStateFromStatus(m.GetString("Slave_IO_Running")) + instance.ReplicationSQLThreadState = ReplicationThreadStateFromStatus(m.GetString("Slave_SQL_Running")) + instance.ReplicationIOThreadRuning = instance.ReplicationIOThreadState.IsRunning() + if isMaxScale110 { + // Covering buggy MaxScale 1.1.0 + instance.ReplicationIOThreadRuning = instance.ReplicationIOThreadRuning && (m.GetString("Slave_IO_State") == "Binlog Dump") + } + instance.ReplicationSQLThreadRuning = instance.ReplicationSQLThreadState.IsRunning() + instance.ReadBinlogCoordinates.LogFile = m.GetString("Master_Log_File") + instance.ReadBinlogCoordinates.LogPos = m.GetInt64("Read_Master_Log_Pos") + instance.ExecBinlogCoordinates.LogFile = m.GetString("Relay_Master_Log_File") + instance.ExecBinlogCoordinates.LogPos = m.GetInt64("Exec_Master_Log_Pos") + instance.IsDetached, _ = instance.ExecBinlogCoordinates.ExtractDetachedCoordinates() + instance.RelaylogCoordinates.LogFile = m.GetString("Relay_Log_File") + instance.RelaylogCoordinates.LogPos = 
m.GetInt64("Relay_Log_Pos") + instance.RelaylogCoordinates.Type = RelayLog + instance.LastSQLError = emptyQuotesRegexp.ReplaceAllString(strconv.QuoteToASCII(m.GetString("Last_SQL_Error")), "") + instance.LastIOError = emptyQuotesRegexp.ReplaceAllString(strconv.QuoteToASCII(m.GetString("Last_IO_Error")), "") + instance.SQLDelay = m.GetUintD("SQL_Delay", 0) + instance.UsingOracleGTID = (m.GetIntD("Auto_Position", 0) == 1) + instance.UsingMariaDBGTID = (m.GetStringD("Using_Gtid", "No") != "No") + instance.MasterUUID = m.GetStringD("Master_UUID", "No") + instance.HasReplicationFilters = ((m.GetStringD("Replicate_Do_DB", "") != "") || (m.GetStringD("Replicate_Ignore_DB", "") != "") || (m.GetStringD("Replicate_Do_Table", "") != "") || (m.GetStringD("Replicate_Ignore_Table", "") != "") || (m.GetStringD("Replicate_Wild_Do_Table", "") != "") || (m.GetStringD("Replicate_Wild_Ignore_Table", "") != "")) + + masterHostname := m.GetString("Master_Host") + if isMaxScale110 { + // Buggy buggy maxscale 1.1.0. Reported Master_Host can be corrupted. + // Therefore we (currently) take @@hostname (which is masquerading as master host anyhow) + masterHostname = maxScaleMasterHostname + } + masterKey, err := NewResolveInstanceKey(masterHostname, m.GetInt("Master_Port")) + if err != nil { + logReadTopologyInstanceError(instanceKey, "NewResolveInstanceKey", err) + } + masterKey.Hostname, resolveErr = ResolveHostname(masterKey.Hostname) + if resolveErr != nil { + logReadTopologyInstanceError(instanceKey, fmt.Sprintf("ResolveHostname(%q)", masterKey.Hostname), resolveErr) + } + instance.MasterKey = *masterKey + instance.IsDetachedMaster = instance.MasterKey.IsDetached() + instance.SecondsBehindMaster = m.GetNullInt64("Seconds_Behind_Master") + if instance.SecondsBehindMaster.Valid && instance.SecondsBehindMaster.Int64 < 0 { + log.Warningf("Host: %+v, instance.SecondsBehindMaster < 0 [%+v], correcting to 0", instanceKey, instance.SecondsBehindMaster.Int64) + instance.SecondsBehindMaster.Int64 = 0 + } + // And until told otherwise: + instance.ReplicationLagSeconds = instance.SecondsBehindMaster + + instance.AllowTLS = (m.GetString("Master_SSL_Allowed") == "Yes") + // Not breaking the flow even on error + slaveStatusFound = true + return nil + }) + if err != nil { + goto Cleanup + } + // Populate GR information for the instance in Oracle MySQL 8.0+. To do this we need to wait for the Server UUID to + // be populated to be able to find this instance's information in performance_schema.replication_group_members by + // comparing UUIDs. We could instead resolve the MEMBER_HOST and MEMBER_PORT columns into an InstanceKey and compare + // those, but this could require external calls for name resolving, whereas comparing UUIDs does not.
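+ // serverUuidWaitGroup is released by the goroutine above that reads @@global.server_uuid (Oracle/Percona + // 5.6 and later), so waiting here ensures instance.ServerUUID is populated before the group lookup.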
+ serverUuidWaitGroup.Wait() + if instance.IsOracleMySQL() && !instance.IsSmallerMajorVersionByString("8.0") { + err := PopulateGroupReplicationInformation(instance, db) + if err != nil { + goto Cleanup + } + } + if isMaxScale && !slaveStatusFound { + err = fmt.Errorf("No 'SHOW SLAVE STATUS' output found for a MaxScale instance: %+v", instanceKey) + goto Cleanup + } + + if config.Config.ReplicationLagQuery != "" && !isMaxScale { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + if err := db.QueryRow(config.Config.ReplicationLagQuery).Scan(&instance.ReplicationLagSeconds); err == nil { + if instance.ReplicationLagSeconds.Valid && instance.ReplicationLagSeconds.Int64 < 0 { + log.Warningf("Host: %+v, instance.SlaveLagSeconds < 0 [%+v], correcting to 0", instanceKey, instance.ReplicationLagSeconds.Int64) + instance.ReplicationLagSeconds.Int64 = 0 + } + } else { + instance.ReplicationLagSeconds = instance.SecondsBehindMaster + logReadTopologyInstanceError(instanceKey, "ReplicationLagQuery", err) + } + }() + } + + instanceFound = true + + // ------------------------------------------------------------------------- + // Anything after this point does not affect the fact the instance is found. + // No `goto Cleanup` after this point. + // ------------------------------------------------------------------------- + + // Get replicas, either by SHOW SLAVE HOSTS or via PROCESSLIST + // MaxScale does not support PROCESSLIST, so SHOW SLAVE HOSTS is the only option + if config.Config.DiscoverByShowSlaveHosts || isMaxScale { + err := sqlutils.QueryRowsMap(db, `show slave hosts`, + func(m sqlutils.RowMap) error { + // MaxScale 1.1 may trigger an error with this command, but + // also we may see issues if anything on the MySQL server locks up. + // Consequently it's important to validate the values received look + // good prior to calling ResolveHostname() + host := m.GetString("Host") + port := m.GetIntD("Port", 0) + if host == "" || port == 0 { + if isMaxScale && host == "" && port == 0 { + // MaxScale reports a bad response sometimes so ignore it. + // - seen in 1.1.0 and 1.4.3.4 + return nil + } + // otherwise report the error to the caller + return fmt.Errorf("ReadTopologyInstance(%+v) 'show slave hosts' returned row with : <%v,%v>", instanceKey, host, port) + } + + replicaKey, err := NewResolveInstanceKey(host, port) + if err == nil && replicaKey.IsValid() { + if !RegexpMatchPatterns(replicaKey.StringCode(), config.Config.DiscoveryIgnoreReplicaHostnameFilters) { + instance.AddReplicaKey(replicaKey) + } + foundByShowSlaveHosts = true + } + return err + }) + + logReadTopologyInstanceError(instanceKey, "show slave hosts", err) + } + if !foundByShowSlaveHosts && !isMaxScale { + // Either not configured to read SHOW SLAVE HOSTS or nothing was there. 
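+ // Note that processlist rows only expose a hostname, so each replica discovered this way is assumed to + // listen on the same port as this instance (see the InstanceKey construction below).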
+ // Discover by information_schema.processlist + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + err := sqlutils.QueryRowsMap(db, ` + select + substring_index(host, ':', 1) as slave_hostname + from + information_schema.processlist + where + command IN ('Binlog Dump', 'Binlog Dump GTID') + `, + func(m sqlutils.RowMap) error { + cname, resolveErr := ResolveHostname(m.GetString("slave_hostname")) + if resolveErr != nil { + logReadTopologyInstanceError(instanceKey, "ResolveHostname: processlist", resolveErr) + } + replicaKey := InstanceKey{Hostname: cname, Port: instance.Key.Port} + if !RegexpMatchPatterns(replicaKey.StringCode(), config.Config.DiscoveryIgnoreReplicaHostnameFilters) { + instance.AddReplicaKey(&replicaKey) + } + return err + }) + + logReadTopologyInstanceError(instanceKey, "processlist", err) + }() + } + + if instance.IsNDB() { + // Discover by ndbinfo about MySQL Cluster SQL nodes + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + err := sqlutils.QueryRowsMap(db, ` + select + substring(service_URI,9) mysql_host + from + ndbinfo.processes + where + process_name='mysqld' + `, + func(m sqlutils.RowMap) error { + cname, resolveErr := ResolveHostname(m.GetString("mysql_host")) + if resolveErr != nil { + logReadTopologyInstanceError(instanceKey, "ResolveHostname: ndbinfo", resolveErr) + } + replicaKey := InstanceKey{Hostname: cname, Port: instance.Key.Port} + instance.AddReplicaKey(&replicaKey) + return err + }) + + logReadTopologyInstanceError(instanceKey, "ndbinfo", err) + }() + } + + if config.Config.DetectDataCenterQuery != "" && !isMaxScale { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + err := db.QueryRow(config.Config.DetectDataCenterQuery).Scan(&instance.DataCenter) + logReadTopologyInstanceError(instanceKey, "DetectDataCenterQuery", err) + }() + } + + if config.Config.DetectRegionQuery != "" && !isMaxScale { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + err := db.QueryRow(config.Config.DetectRegionQuery).Scan(&instance.Region) + logReadTopologyInstanceError(instanceKey, "DetectRegionQuery", err) + }() + } + + if config.Config.DetectPhysicalEnvironmentQuery != "" && !isMaxScale { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + err := db.QueryRow(config.Config.DetectPhysicalEnvironmentQuery).Scan(&instance.PhysicalEnvironment) + logReadTopologyInstanceError(instanceKey, "DetectPhysicalEnvironmentQuery", err) + }() + } + + if config.Config.DetectInstanceAliasQuery != "" && !isMaxScale { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + err := db.QueryRow(config.Config.DetectInstanceAliasQuery).Scan(&instance.InstanceAlias) + logReadTopologyInstanceError(instanceKey, "DetectInstanceAliasQuery", err) + }() + } + + if config.Config.DetectSemiSyncEnforcedQuery != "" && !isMaxScale { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + err := db.QueryRow(config.Config.DetectSemiSyncEnforcedQuery).Scan(&instance.SemiSyncEnforced) + logReadTopologyInstanceError(instanceKey, "DetectSemiSyncEnforcedQuery", err) + }() + } + + { + latency.Start("backend") + err = ReadInstanceClusterAttributes(instance) + latency.Stop("backend") + logReadTopologyInstanceError(instanceKey, "ReadInstanceClusterAttributes", err) + } + + { + // Pseudo GTID + // Depends on ReadInstanceClusterAttributes above + instance.UsingPseudoGTID = false + if config.Config.AutoPseudoGTID { + var err error + instance.UsingPseudoGTID, err = isInjectedPseudoGTID(instance.ClusterName) + log.Errore(err) + } else if config.Config.DetectPseudoGTIDQuery 
!= "" { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + if resultData, err := sqlutils.QueryResultData(db, config.Config.DetectPseudoGTIDQuery); err == nil { + if len(resultData) > 0 { + if len(resultData[0]) > 0 { + if resultData[0][0].Valid && resultData[0][0].String == "1" { + instance.UsingPseudoGTID = true + } + } + } + } else { + logReadTopologyInstanceError(instanceKey, "DetectPseudoGTIDQuery", err) + } + }() + } + } + + // First read the current PromotionRule from candidate_database_instance. + { + latency.Start("backend") + err = ReadInstancePromotionRule(instance) + latency.Stop("backend") + logReadTopologyInstanceError(instanceKey, "ReadInstancePromotionRule", err) + } + // Then check if the instance wants to set a different PromotionRule. + // We'll set it here on their behalf so there's no race between the first + // time an instance is discovered, and setting a rule like "must_not". + if config.Config.DetectPromotionRuleQuery != "" && !isMaxScale { + waitGroup.Add(1) + go func() { + defer waitGroup.Done() + var value string + err := db.QueryRow(config.Config.DetectPromotionRuleQuery).Scan(&value) + logReadTopologyInstanceError(instanceKey, "DetectPromotionRuleQuery", err) + promotionRule, err := ParseCandidatePromotionRule(value) + logReadTopologyInstanceError(instanceKey, "ParseCandidatePromotionRule", err) + if err == nil { + // We need to update candidate_database_instance. + // We register the rule even if it hasn't changed, + // to bump the last_suggested time. + instance.PromotionRule = promotionRule + err = RegisterCandidateInstance(NewCandidateDatabaseInstance(instanceKey, promotionRule).WithCurrentTime()) + logReadTopologyInstanceError(instanceKey, "RegisterCandidateInstance", err) + } + }() + } + + ReadClusterAliasOverride(instance) + if !isMaxScale { + if instance.SuggestedClusterAlias == "" { + // Only need to do on masters + if config.Config.DetectClusterAliasQuery != "" { + clusterAlias := "" + if err := db.QueryRow(config.Config.DetectClusterAliasQuery).Scan(&clusterAlias); err != nil { + logReadTopologyInstanceError(instanceKey, "DetectClusterAliasQuery", err) + } else { + instance.SuggestedClusterAlias = clusterAlias + } + } + } + if instance.SuggestedClusterAlias == "" { + // Not found by DetectClusterAliasQuery... 
+			// See if a ClusterNameToAlias configuration applies
+			if clusterAlias := mappedClusterNameToAlias(instance.ClusterName); clusterAlias != "" {
+				instance.SuggestedClusterAlias = clusterAlias
+			}
+		}
+	}
+	if instance.ReplicationDepth == 0 && config.Config.DetectClusterDomainQuery != "" && !isMaxScale {
+		// Only need to do on masters
+		domainName := ""
+		if err := db.QueryRow(config.Config.DetectClusterDomainQuery).Scan(&domainName); err != nil {
+			domainName = ""
+			logReadTopologyInstanceError(instanceKey, "DetectClusterDomainQuery", err)
+		}
+		if domainName != "" {
+			latency.Start("backend")
+			err := WriteClusterDomainName(instance.ClusterName, domainName)
+			latency.Stop("backend")
+			logReadTopologyInstanceError(instanceKey, "WriteClusterDomainName", err)
+		}
+	}
+
+Cleanup:
+	waitGroup.Wait()
+	close(errorChan)
+	err = func() error {
+		if err != nil {
+			return err
+		}
+
+		for err := range errorChan {
+			if err != nil {
+				return err
+			}
+		}
+		return nil
+	}()
+
+	if instanceFound {
+		if instance.IsCoMaster {
+			// Take co-master into account, and avoid infinite loop
+			instance.AncestryUUID = fmt.Sprintf("%s,%s", instance.MasterUUID, instance.ServerUUID)
+		} else {
+			instance.AncestryUUID = fmt.Sprintf("%s,%s", instance.AncestryUUID, instance.ServerUUID)
+		}
+		// Add replication group ancestry UUID as well. Otherwise, Orchestrator thinks there are errant GTIDs in group
+		// members and their replicas, even though there are none.
+		instance.AncestryUUID = fmt.Sprintf("%s,%s", instance.AncestryUUID, instance.ReplicationGroupName)
+		instance.AncestryUUID = strings.Trim(instance.AncestryUUID, ",")
+		if instance.ExecutedGtidSet != "" && instance.masterExecutedGtidSet != "" {
+			// Compare master & replica GTID sets, but ignore the sets that present the master's UUID.
+			// This is because orchestrator may poll the master and the replica at an inconvenient time,
+			// such that the replica may _seem_ to have more entries than the master, when in fact
+			// it's just that the master's probing is stale.
+			redactedExecutedGtidSet, _ := NewOracleGtidSet(instance.ExecutedGtidSet)
+			for _, uuid := range strings.Split(instance.AncestryUUID, ",") {
+				if uuid != instance.ServerUUID {
+					redactedExecutedGtidSet.RemoveUUID(uuid)
+				}
+				if instance.IsCoMaster && uuid == instance.ServerUUID {
+					// If this is a co-master, then this server is likely to show its own generated GTIDs as errant,
+					// because its co-master has not applied them yet
+					redactedExecutedGtidSet.RemoveUUID(uuid)
+				}
+			}
+			// Avoid querying the database if there's no point:
+			if !redactedExecutedGtidSet.IsEmpty() {
+				redactedMasterExecutedGtidSet, _ := NewOracleGtidSet(instance.masterExecutedGtidSet)
+				redactedMasterExecutedGtidSet.RemoveUUID(instance.MasterUUID)
+
+				db.QueryRow("select gtid_subtract(?, ?)", redactedExecutedGtidSet.String(), redactedMasterExecutedGtidSet.String()).Scan(&instance.GtidErrant)
+			}
+		}
+	}
+
+	latency.Stop("instance")
+	readTopologyInstanceCounter.Inc(1)
+
+	if instanceFound {
+		instance.LastDiscoveryLatency = time.Since(readingStartTime)
+		instance.IsLastCheckValid = true
+		instance.IsRecentlyChecked = true
+		instance.IsUpToDate = true
+		latency.Start("backend")
+		if bufferWrites {
+			enqueueInstanceWrite(instance, instanceFound, err)
+		} else {
+			WriteInstance(instance, instanceFound, err)
+		}
+		lastAttemptedCheckTimer.Stop()
+		latency.Stop("backend")
+		return instance, nil
+	}
+
+	// Something is wrong, could be network-wise. Record that we
+	// tried to check the instance.
last_attempted_check is also + // updated on success by writeInstance. + latency.Start("backend") + _ = UpdateInstanceLastChecked(&instance.Key, partialSuccess) + latency.Stop("backend") + return nil, err +} + +// ReadClusterAliasOverride reads and applies SuggestedClusterAlias based on cluster_alias_override +func ReadClusterAliasOverride(instance *Instance) (err error) { + aliasOverride := "" + query := ` + select + alias + from + cluster_alias_override + where + cluster_name = ? + ` + err = db.QueryOrchestrator(query, sqlutils.Args(instance.ClusterName), func(m sqlutils.RowMap) error { + aliasOverride = m.GetString("alias") + + return nil + }) + if aliasOverride != "" { + instance.SuggestedClusterAlias = aliasOverride + } + return err +} + +func ReadReplicationGroupPrimary(instance *Instance) (err error) { + query := ` + SELECT + replication_group_primary_host, + replication_group_primary_port + FROM + database_instance + WHERE + replication_group_name = ? + AND replication_group_member_role = 'PRIMARY' +` + queryArgs := sqlutils.Args(instance.ReplicationGroupName) + err = db.QueryOrchestrator(query, queryArgs, func(row sqlutils.RowMap) error { + groupPrimaryHost := row.GetString("replication_group_primary_host") + groupPrimaryPort := row.GetInt("replication_group_primary_port") + resolvedGroupPrimary, err := NewResolveInstanceKey(groupPrimaryHost, groupPrimaryPort) + if err != nil { + return err + } + instance.ReplicationGroupPrimaryInstanceKey = *resolvedGroupPrimary + return nil + }) + return err +} + +// ReadInstanceClusterAttributes will return the cluster name for a given instance by looking at its master +// and getting it from there. +// It is a non-recursive function and so-called-recursion is performed upon periodic reading of +// instances. +func ReadInstanceClusterAttributes(instance *Instance) (err error) { + var masterOrGroupPrimaryInstanceKey InstanceKey + var masterOrGroupPrimaryClusterName string + var masterOrGroupPrimarySuggestedClusterAlias string + var masterOrGroupPrimaryReplicationDepth uint + var ancestryUUID string + var masterOrGroupPrimaryExecutedGtidSet string + masterOrGroupPrimaryDataFound := false + + // Read the cluster_name of the _master_ or _group_primary_ of our instance, derive it from there. + query := ` + select + cluster_name, + suggested_cluster_alias, + replication_depth, + master_host, + master_port, + ancestry_uuid, + executed_gtid_set + from database_instance + where hostname=? and port=? + ` + // For instances that are part of a replication group, if the host is not the group's primary, we use the + // information from the group primary. If it is the group primary, we use the information of its master + // (if it has any). If it is not a group member, we use the information from the host's master. 
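A minimal, self-contained sketch of the precedence rule described in the comment above, for readers following the diff. The struct and the isReplicationGroupSecondary helper are simplified stand-ins (assumptions made for illustration), not the types defined in this package:

package main

import "fmt"

// Simplified stand-ins for the fields consulted by ReadInstanceClusterAttributes (illustrative only).
type instanceKey struct {
	Hostname string
	Port     int
}

type instance struct {
	MasterKey                          instanceKey
	ReplicationGroupPrimaryInstanceKey instanceKey
	ReplicationGroupName               string
	ReplicationGroupMemberRole         string
}

// isReplicationGroupSecondary approximates the real method: a group member that is not its primary.
func (i *instance) isReplicationGroupSecondary() bool {
	return i.ReplicationGroupName != "" && i.ReplicationGroupMemberRole != "PRIMARY"
}

// attributesSource mirrors the precedence above: a group secondary derives cluster
// attributes from its group primary; anything else derives them from its master.
func attributesSource(i *instance) instanceKey {
	if i.isReplicationGroupSecondary() {
		return i.ReplicationGroupPrimaryInstanceKey
	}
	return i.MasterKey
}

func main() {
	secondary := &instance{
		MasterKey:                          instanceKey{Hostname: "master-1", Port: 3306},
		ReplicationGroupPrimaryInstanceKey: instanceKey{Hostname: "group-primary-1", Port: 3306},
		ReplicationGroupName:               "group-1",
		ReplicationGroupMemberRole:         "SECONDARY",
	}
	fmt.Println(attributesSource(secondary)) // {group-primary-1 3306}
}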
+ if instance.IsReplicationGroupSecondary() { + masterOrGroupPrimaryInstanceKey = instance.ReplicationGroupPrimaryInstanceKey + } else { + masterOrGroupPrimaryInstanceKey = instance.MasterKey + } + args := sqlutils.Args(masterOrGroupPrimaryInstanceKey.Hostname, masterOrGroupPrimaryInstanceKey.Port) + err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + masterOrGroupPrimaryClusterName = m.GetString("cluster_name") + masterOrGroupPrimarySuggestedClusterAlias = m.GetString("suggested_cluster_alias") + masterOrGroupPrimaryReplicationDepth = m.GetUint("replication_depth") + masterOrGroupPrimaryInstanceKey.Hostname = m.GetString("master_host") + masterOrGroupPrimaryInstanceKey.Port = m.GetInt("master_port") + ancestryUUID = m.GetString("ancestry_uuid") + masterOrGroupPrimaryExecutedGtidSet = m.GetString("executed_gtid_set") + masterOrGroupPrimaryDataFound = true + return nil + }) + if err != nil { + return log.Errore(err) + } + + var replicationDepth uint = 0 + var clusterName string + if masterOrGroupPrimaryDataFound { + replicationDepth = masterOrGroupPrimaryReplicationDepth + 1 + clusterName = masterOrGroupPrimaryClusterName + } + clusterNameByInstanceKey := instance.Key.StringCode() + if clusterName == "" { + // Nothing from master; we set it to be named after the instance itself + clusterName = clusterNameByInstanceKey + } + + isCoMaster := false + if masterOrGroupPrimaryInstanceKey.Equals(&instance.Key) { + // co-master calls for special case, in fear of the infinite loop + isCoMaster = true + clusterNameByCoMasterKey := instance.MasterKey.StringCode() + if clusterName != clusterNameByInstanceKey && clusterName != clusterNameByCoMasterKey { + // Can be caused by a co-master topology failover + log.Errorf("ReadInstanceClusterAttributes: in co-master topology %s is not in (%s, %s). Forcing it to become one of them", clusterName, clusterNameByInstanceKey, clusterNameByCoMasterKey) + clusterName = math.TernaryString(instance.Key.SmallerThan(&instance.MasterKey), clusterNameByInstanceKey, clusterNameByCoMasterKey) + } + if clusterName == clusterNameByInstanceKey { + // circular replication. Avoid infinite ++ on replicationDepth + replicationDepth = 0 + ancestryUUID = "" + } // While the other stays "1" + } + instance.ClusterName = clusterName + instance.SuggestedClusterAlias = masterOrGroupPrimarySuggestedClusterAlias + instance.ReplicationDepth = replicationDepth + instance.IsCoMaster = isCoMaster + instance.AncestryUUID = ancestryUUID + instance.masterExecutedGtidSet = masterOrGroupPrimaryExecutedGtidSet + return nil +} + +type byNamePort [](*InstanceKey) + +func (this byNamePort) Len() int { return len(this) } +func (this byNamePort) Swap(i, j int) { this[i], this[j] = this[j], this[i] } +func (this byNamePort) Less(i, j int) bool { + return (this[i].Hostname < this[j].Hostname) || + (this[i].Hostname == this[j].Hostname && this[i].Port < this[j].Port) +} + +// BulkReadInstance returns a list of all instances from the database +// - I only need the Hostname and Port fields. +// - I must use readInstancesByCondition to ensure all column +// settings are correct. 
+func BulkReadInstance() ([](*InstanceKey), error) { + // no condition (I want all rows) and no sorting (but this is done by Hostname, Port anyway) + const ( + condition = "1=1" + orderBy = "" + ) + var instanceKeys [](*InstanceKey) + + instances, err := readInstancesByCondition(condition, nil, orderBy) + if err != nil { + return nil, fmt.Errorf("BulkReadInstance: %+v", err) + } + + // update counters if we picked anything up + if len(instances) > 0 { + readInstanceCounter.Inc(int64(len(instances))) + + for _, instance := range instances { + instanceKeys = append(instanceKeys, &instance.Key) + } + // sort on orchestrator and not the backend (should be redundant) + sort.Sort(byNamePort(instanceKeys)) + } + + return instanceKeys, nil +} + +func ReadInstancePromotionRule(instance *Instance) (err error) { + var promotionRule CandidatePromotionRule = NeutralPromoteRule + query := ` + select + ifnull(nullif(promotion_rule, ''), 'neutral') as promotion_rule + from candidate_database_instance + where hostname=? and port=? + ` + args := sqlutils.Args(instance.Key.Hostname, instance.Key.Port) + + err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + promotionRule = CandidatePromotionRule(m.GetString("promotion_rule")) + return nil + }) + instance.PromotionRule = promotionRule + return log.Errore(err) +} + +// readInstanceRow reads a single instance row from the orchestrator backend database. +func readInstanceRow(m sqlutils.RowMap) *Instance { + instance := NewInstance() + + instance.Key.Hostname = m.GetString("hostname") + instance.Key.Port = m.GetInt("port") + instance.Uptime = m.GetUint("uptime") + instance.ServerID = m.GetUint("server_id") + instance.ServerUUID = m.GetString("server_uuid") + instance.Version = m.GetString("version") + instance.VersionComment = m.GetString("version_comment") + instance.ReadOnly = m.GetBool("read_only") + instance.Binlog_format = m.GetString("binlog_format") + instance.BinlogRowImage = m.GetString("binlog_row_image") + instance.LogBinEnabled = m.GetBool("log_bin") + instance.LogReplicationUpdatesEnabled = m.GetBool("log_slave_updates") + instance.MasterKey.Hostname = m.GetString("master_host") + instance.MasterKey.Port = m.GetInt("master_port") + instance.IsDetachedMaster = instance.MasterKey.IsDetached() + instance.ReplicationSQLThreadRuning = m.GetBool("slave_sql_running") + instance.ReplicationIOThreadRuning = m.GetBool("slave_io_running") + instance.ReplicationSQLThreadState = ReplicationThreadState(m.GetInt("replication_sql_thread_state")) + instance.ReplicationIOThreadState = ReplicationThreadState(m.GetInt("replication_io_thread_state")) + instance.HasReplicationFilters = m.GetBool("has_replication_filters") + instance.SupportsOracleGTID = m.GetBool("supports_oracle_gtid") + instance.UsingOracleGTID = m.GetBool("oracle_gtid") + instance.MasterUUID = m.GetString("master_uuid") + instance.AncestryUUID = m.GetString("ancestry_uuid") + instance.ExecutedGtidSet = m.GetString("executed_gtid_set") + instance.GTIDMode = m.GetString("gtid_mode") + instance.GtidPurged = m.GetString("gtid_purged") + instance.GtidErrant = m.GetString("gtid_errant") + instance.UsingMariaDBGTID = m.GetBool("mariadb_gtid") + instance.UsingPseudoGTID = m.GetBool("pseudo_gtid") + instance.SelfBinlogCoordinates.LogFile = m.GetString("binary_log_file") + instance.SelfBinlogCoordinates.LogPos = m.GetInt64("binary_log_pos") + instance.ReadBinlogCoordinates.LogFile = m.GetString("master_log_file") + instance.ReadBinlogCoordinates.LogPos = m.GetInt64("read_master_log_pos") + 
instance.ExecBinlogCoordinates.LogFile = m.GetString("relay_master_log_file") + instance.ExecBinlogCoordinates.LogPos = m.GetInt64("exec_master_log_pos") + instance.IsDetached, _ = instance.ExecBinlogCoordinates.ExtractDetachedCoordinates() + instance.RelaylogCoordinates.LogFile = m.GetString("relay_log_file") + instance.RelaylogCoordinates.LogPos = m.GetInt64("relay_log_pos") + instance.RelaylogCoordinates.Type = RelayLog + instance.LastSQLError = m.GetString("last_sql_error") + instance.LastIOError = m.GetString("last_io_error") + instance.SecondsBehindMaster = m.GetNullInt64("seconds_behind_master") + instance.ReplicationLagSeconds = m.GetNullInt64("slave_lag_seconds") + instance.SQLDelay = m.GetUint("sql_delay") + replicasJSON := m.GetString("slave_hosts") + instance.ClusterName = m.GetString("cluster_name") + instance.SuggestedClusterAlias = m.GetString("suggested_cluster_alias") + instance.DataCenter = m.GetString("data_center") + instance.Region = m.GetString("region") + instance.PhysicalEnvironment = m.GetString("physical_environment") + instance.SemiSyncEnforced = m.GetBool("semi_sync_enforced") + instance.SemiSyncAvailable = m.GetBool("semi_sync_available") + instance.SemiSyncMasterEnabled = m.GetBool("semi_sync_master_enabled") + instance.SemiSyncMasterTimeout = m.GetUint64("semi_sync_master_timeout") + instance.SemiSyncMasterWaitForReplicaCount = m.GetUint("semi_sync_master_wait_for_slave_count") + instance.SemiSyncReplicaEnabled = m.GetBool("semi_sync_replica_enabled") + instance.SemiSyncMasterStatus = m.GetBool("semi_sync_master_status") + instance.SemiSyncMasterClients = m.GetUint("semi_sync_master_clients") + instance.SemiSyncReplicaStatus = m.GetBool("semi_sync_replica_status") + instance.ReplicationDepth = m.GetUint("replication_depth") + instance.IsCoMaster = m.GetBool("is_co_master") + instance.ReplicationCredentialsAvailable = m.GetBool("replication_credentials_available") + instance.HasReplicationCredentials = m.GetBool("has_replication_credentials") + instance.IsUpToDate = (m.GetUint("seconds_since_last_checked") <= config.Config.InstancePollSeconds) + instance.IsRecentlyChecked = (m.GetUint("seconds_since_last_checked") <= config.Config.InstancePollSeconds*5) + instance.LastSeenTimestamp = m.GetString("last_seen") + instance.IsLastCheckValid = m.GetBool("is_last_check_valid") + instance.SecondsSinceLastSeen = m.GetNullInt64("seconds_since_last_seen") + instance.IsCandidate = m.GetBool("is_candidate") + instance.PromotionRule = CandidatePromotionRule(m.GetString("promotion_rule")) + instance.IsDowntimed = m.GetBool("is_downtimed") + instance.DowntimeReason = m.GetString("downtime_reason") + instance.DowntimeOwner = m.GetString("downtime_owner") + instance.DowntimeEndTimestamp = m.GetString("downtime_end_timestamp") + instance.ElapsedDowntime = time.Second * time.Duration(m.GetInt("elapsed_downtime_seconds")) + instance.UnresolvedHostname = m.GetString("unresolved_hostname") + instance.AllowTLS = m.GetBool("allow_tls") + instance.InstanceAlias = m.GetString("instance_alias") + instance.LastDiscoveryLatency = time.Duration(m.GetInt64("last_discovery_latency")) * time.Nanosecond + + instance.Replicas.ReadJson(replicasJSON) + instance.applyFlavorName() + + /* Read Group Replication variables below */ + instance.ReplicationGroupName = m.GetString("replication_group_name") + instance.ReplicationGroupIsSinglePrimary = m.GetBool("replication_group_is_single_primary_mode") + instance.ReplicationGroupMemberState = m.GetString("replication_group_member_state") + 
instance.ReplicationGroupMemberRole = m.GetString("replication_group_member_role") + instance.ReplicationGroupPrimaryInstanceKey = InstanceKey{Hostname: m.GetString("replication_group_primary_host"), + Port: m.GetInt("replication_group_primary_port")} + instance.ReplicationGroupMembers.ReadJson(m.GetString("replication_group_members")) + //instance.ReplicationGroup = m.GetString("replication_group_") + + // problems + if !instance.IsLastCheckValid { + instance.Problems = append(instance.Problems, "last_check_invalid") + } else if !instance.IsRecentlyChecked { + instance.Problems = append(instance.Problems, "not_recently_checked") + } else if instance.ReplicationThreadsExist() && !instance.ReplicaRunning() { + instance.Problems = append(instance.Problems, "not_replicating") + } else if instance.ReplicationLagSeconds.Valid && math.AbsInt64(instance.ReplicationLagSeconds.Int64-int64(instance.SQLDelay)) > int64(config.Config.ReasonableReplicationLagSeconds) { + instance.Problems = append(instance.Problems, "replication_lag") + } + if instance.GtidErrant != "" { + instance.Problems = append(instance.Problems, "errant_gtid") + } + // Group replication problems + if instance.ReplicationGroupName != "" && instance.ReplicationGroupMemberState != GroupReplicationMemberStateOnline { + instance.Problems = append(instance.Problems, "group_replication_member_not_online") + } + + return instance +} + +// readInstancesByCondition is a generic function to read instances from the backend database +func readInstancesByCondition(condition string, args []interface{}, sort string) ([](*Instance), error) { + readFunc := func() ([](*Instance), error) { + instances := [](*Instance){} + + if sort == "" { + sort = `hostname, port` + } + query := fmt.Sprintf(` + select + *, + unix_timestamp() - unix_timestamp(last_checked) as seconds_since_last_checked, + ifnull(last_checked <= last_seen, 0) as is_last_check_valid, + unix_timestamp() - unix_timestamp(last_seen) as seconds_since_last_seen, + candidate_database_instance.last_suggested is not null + and candidate_database_instance.promotion_rule in ('must', 'prefer') as is_candidate, + ifnull(nullif(candidate_database_instance.promotion_rule, ''), 'neutral') as promotion_rule, + ifnull(unresolved_hostname, '') as unresolved_hostname, + (database_instance_downtime.downtime_active is not null and ifnull(database_instance_downtime.end_timestamp, now()) > now()) as is_downtimed, + ifnull(database_instance_downtime.reason, '') as downtime_reason, + ifnull(database_instance_downtime.owner, '') as downtime_owner, + ifnull(unix_timestamp() - unix_timestamp(begin_timestamp), 0) as elapsed_downtime_seconds, + ifnull(database_instance_downtime.end_timestamp, '') as downtime_end_timestamp + from + database_instance + left join candidate_database_instance using (hostname, port) + left join hostname_unresolve using (hostname) + left join database_instance_downtime using (hostname, port) + where + %s + order by + %s + `, condition, sort) + + err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + instance := readInstanceRow(m) + instances = append(instances, instance) + return nil + }) + if err != nil { + return instances, log.Errore(err) + } + err = PopulateInstancesAgents(instances) + if err != nil { + return instances, log.Errore(err) + } + return instances, err + } + instanceReadChan <- true + instances, err := readFunc() + <-instanceReadChan + return instances, err +} + +func readInstancesByExactKey(instanceKey *InstanceKey) ([](*Instance), error) { + condition 
:= ` + hostname = ? + and port = ? + ` + return readInstancesByCondition(condition, sqlutils.Args(instanceKey.Hostname, instanceKey.Port), "") +} + +// ReadInstance reads an instance from the orchestrator backend database +func ReadInstance(instanceKey *InstanceKey) (*Instance, bool, error) { + instances, err := readInstancesByExactKey(instanceKey) + // We know there will be at most one (hostname & port are PK) + // And we expect to find one + readInstanceCounter.Inc(1) + if len(instances) == 0 { + return nil, false, err + } + if err != nil { + return instances[0], false, err + } + return instances[0], true, nil +} + +// ReadClusterInstances reads all instances of a given cluster +func ReadClusterInstances(clusterName string) ([](*Instance), error) { + if strings.Index(clusterName, "'") >= 0 { + return [](*Instance){}, log.Errorf("Invalid cluster name: %s", clusterName) + } + condition := `cluster_name = ?` + return readInstancesByCondition(condition, sqlutils.Args(clusterName), "") +} + +// ReadClusterWriteableMaster returns the/a writeable master of this cluster +// Typically, the cluster name indicates the master of the cluster. However, in circular +// master-master replication one master can assume the name of the cluster, and it is +// not guaranteed that it is the writeable one. +func ReadClusterWriteableMaster(clusterName string) ([](*Instance), error) { + condition := ` + cluster_name = ? + and read_only = 0 + and (replication_depth = 0 or is_co_master) + ` + return readInstancesByCondition(condition, sqlutils.Args(clusterName), "replication_depth asc") +} + +// ReadClusterMaster returns the master of this cluster. +// - if the cluster has co-masters, the/a writable one is returned +// - if the cluster has a single master, that master is retuened whether it is read-only or writable. +func ReadClusterMaster(clusterName string) ([](*Instance), error) { + condition := ` + cluster_name = ? + and (replication_depth = 0 or is_co_master) + ` + return readInstancesByCondition(condition, sqlutils.Args(clusterName), "read_only asc, replication_depth asc") +} + +// ReadWriteableClustersMasters returns writeable masters of all clusters, but only one +// per cluster, in similar logic to ReadClusterWriteableMaster +func ReadWriteableClustersMasters() (instances [](*Instance), err error) { + condition := ` + read_only = 0 + and (replication_depth = 0 or is_co_master) + ` + allMasters, err := readInstancesByCondition(condition, sqlutils.Args(), "cluster_name asc, replication_depth asc") + if err != nil { + return instances, err + } + visitedClusters := make(map[string]bool) + for _, instance := range allMasters { + if !visitedClusters[instance.ClusterName] { + visitedClusters[instance.ClusterName] = true + instances = append(instances, instance) + } + } + return instances, err +} + +// ReadReplicaInstances reads replicas of a given master +func ReadReplicaInstances(masterKey *InstanceKey) ([](*Instance), error) { + condition := ` + master_host = ? + and master_port = ? 
+ ` + return readInstancesByCondition(condition, sqlutils.Args(masterKey.Hostname, masterKey.Port), "") +} + +// ReadReplicaInstancesIncludingBinlogServerSubReplicas returns a list of direct slves including any replicas +// of a binlog server replica +func ReadReplicaInstancesIncludingBinlogServerSubReplicas(masterKey *InstanceKey) ([](*Instance), error) { + replicas, err := ReadReplicaInstances(masterKey) + if err != nil { + return replicas, err + } + for _, replica := range replicas { + replica := replica + if replica.IsBinlogServer() { + binlogServerReplicas, err := ReadReplicaInstancesIncludingBinlogServerSubReplicas(&replica.Key) + if err != nil { + return replicas, err + } + replicas = append(replicas, binlogServerReplicas...) + } + } + return replicas, err +} + +// ReadBinlogServerReplicaInstances reads direct replicas of a given master that are binlog servers +func ReadBinlogServerReplicaInstances(masterKey *InstanceKey) ([](*Instance), error) { + condition := ` + master_host = ? + and master_port = ? + and binlog_server = 1 + ` + return readInstancesByCondition(condition, sqlutils.Args(masterKey.Hostname, masterKey.Port), "") +} + +// ReadUnseenInstances reads all instances which were not recently seen +func ReadUnseenInstances() ([](*Instance), error) { + condition := `last_seen < last_checked` + return readInstancesByCondition(condition, sqlutils.Args(), "") +} + +// ReadProblemInstances reads all instances with problems +func ReadProblemInstances(clusterName string) ([](*Instance), error) { + condition := ` + cluster_name LIKE (CASE WHEN ? = '' THEN '%' ELSE ? END) + and ( + (last_seen < last_checked) + or (unix_timestamp() - unix_timestamp(last_checked) > ?) + or (replication_sql_thread_state not in (-1 ,1)) + or (replication_io_thread_state not in (-1 ,1)) + or (abs(cast(seconds_behind_master as signed) - cast(sql_delay as signed)) > ?) + or (abs(cast(slave_lag_seconds as signed) - cast(sql_delay as signed)) > ?) + or (gtid_errant != '') + or (replication_group_name != '' and replication_group_member_state != 'ONLINE') + ) + ` + + args := sqlutils.Args(clusterName, clusterName, config.Config.InstancePollSeconds*5, config.Config.ReasonableReplicationLagSeconds, config.Config.ReasonableReplicationLagSeconds) + instances, err := readInstancesByCondition(condition, args, "") + if err != nil { + return instances, err + } + var reportedInstances [](*Instance) + for _, instance := range instances { + skip := false + if instance.IsDowntimed { + skip = true + } + if RegexpMatchPatterns(instance.Key.StringCode(), config.Config.ProblemIgnoreHostnameFilters) { + skip = true + } + if !skip { + reportedInstances = append(reportedInstances, instance) + } + } + return reportedInstances, nil +} + +// SearchInstances reads all instances qualifying for some searchString +func SearchInstances(searchString string) ([](*Instance), error) { + searchString = strings.TrimSpace(searchString) + condition := ` + instr(hostname, ?) > 0 + or instr(cluster_name, ?) > 0 + or instr(version, ?) > 0 + or instr(version_comment, ?) > 0 + or instr(concat(hostname, ':', port), ?) > 0 + or instr(suggested_cluster_alias, ?) > 0 + or concat(server_id, '') = ? + or concat(port, '') = ? 
+ ` + args := sqlutils.Args(searchString, searchString, searchString, searchString, searchString, searchString, searchString, searchString) + return readInstancesByCondition(condition, args, `replication_depth asc, num_slave_hosts desc, cluster_name, hostname, port`) +} + +// FindInstances reads all instances whose name matches given pattern +func FindInstances(regexpPattern string) (result [](*Instance), err error) { + result = [](*Instance){} + r, err := regexp.Compile(regexpPattern) + if err != nil { + return result, err + } + condition := `1=1` + unfiltered, err := readInstancesByCondition(condition, sqlutils.Args(), `replication_depth asc, num_slave_hosts desc, cluster_name, hostname, port`) + if err != nil { + return unfiltered, err + } + for _, instance := range unfiltered { + if r.MatchString(instance.Key.DisplayString()) { + result = append(result, instance) + } + } + return result, nil +} + +// findFuzzyInstances return instances whose names are like the one given (host & port substrings) +// For example, the given `mydb-3:3306` might find `myhosts-mydb301-production.mycompany.com:3306` +func findFuzzyInstances(fuzzyInstanceKey *InstanceKey) ([](*Instance), error) { + condition := ` + hostname like concat('%%', ?, '%%') + and port = ? + ` + return readInstancesByCondition(condition, sqlutils.Args(fuzzyInstanceKey.Hostname, fuzzyInstanceKey.Port), `replication_depth asc, num_slave_hosts desc, cluster_name, hostname, port`) +} + +// ReadFuzzyInstanceKey accepts a fuzzy instance key and expects to return a single, fully qualified, +// known instance key. +func ReadFuzzyInstanceKey(fuzzyInstanceKey *InstanceKey) *InstanceKey { + if fuzzyInstanceKey == nil { + return nil + } + if fuzzyInstanceKey.IsIPv4() { + // avoid fuzziness. When looking for 10.0.0.1 we don't want to match 10.0.0.15! + return nil + } + if fuzzyInstanceKey.Hostname != "" { + // Fuzzy instance search + if fuzzyInstances, _ := findFuzzyInstances(fuzzyInstanceKey); len(fuzzyInstances) == 1 { + return &(fuzzyInstances[0].Key) + } + } + return nil +} + +// ReadFuzzyInstanceKeyIfPossible accepts a fuzzy instance key and hopes to return a single, fully qualified, +// known instance key, or else the original given key +func ReadFuzzyInstanceKeyIfPossible(fuzzyInstanceKey *InstanceKey) *InstanceKey { + if instanceKey := ReadFuzzyInstanceKey(fuzzyInstanceKey); instanceKey != nil { + return instanceKey + } + return fuzzyInstanceKey +} + +// ReadFuzzyInstance accepts a fuzzy instance key and expects to return a single instance. +// Multiple instances matching the fuzzy keys are not allowed. +func ReadFuzzyInstance(fuzzyInstanceKey *InstanceKey) (*Instance, error) { + if fuzzyInstanceKey == nil { + return nil, log.Errorf("ReadFuzzyInstance received nil input") + } + if fuzzyInstanceKey.IsIPv4() { + // avoid fuzziness. When looking for 10.0.0.1 we don't want to match 10.0.0.15! + instance, _, err := ReadInstance(fuzzyInstanceKey) + return instance, err + } + if fuzzyInstanceKey.Hostname != "" { + // Fuzzy instance search + if fuzzyInstances, _ := findFuzzyInstances(fuzzyInstanceKey); len(fuzzyInstances) == 1 { + return fuzzyInstances[0], nil + } + } + return nil, log.Errorf("Cannot determine fuzzy instance %+v", *fuzzyInstanceKey) +} + +// ReadLostInRecoveryInstances returns all instances (potentially filtered by cluster) +// which are currently indicated as downtimed due to being lost during a topology recovery. 
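As a side note to this hunk, the fuzzy matching that findFuzzyInstances and ReadFuzzyInstance (above) rely on boils down to a substring match on the hostname plus an exact port match. A rough stand-in for the SQL predicate, using hypothetical hostnames:

package main

import (
	"fmt"
	"strings"
)

// matchesFuzzy approximates the SQL predicate used above:
//   hostname like concat('%', ?, '%') and port = ?
func matchesFuzzy(fuzzyHost string, fuzzyPort int, host string, port int) bool {
	return strings.Contains(host, fuzzyHost) && port == fuzzyPort
}

func main() {
	// Hypothetical hostnames, for illustration only.
	fmt.Println(matchesFuzzy("mydb-3", 3306, "mydb-3.prod.mycompany.com", 3306))    // true
	fmt.Println(matchesFuzzy("mydb-3", 3306, "otherdb-1.prod.mycompany.com", 3306)) // false
	fmt.Println(matchesFuzzy("mydb-3", 3306, "mydb-3.prod.mycompany.com", 3307))    // false: port must match exactly
}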
+func ReadLostInRecoveryInstances(clusterName string) ([](*Instance), error) { + condition := ` + ifnull( + database_instance_downtime.downtime_active = 1 + and database_instance_downtime.end_timestamp > now() + and database_instance_downtime.reason = ?, 0) + and ? IN ('', cluster_name) + ` + return readInstancesByCondition(condition, sqlutils.Args(DowntimeLostInRecoveryMessage, clusterName), "cluster_name asc, replication_depth asc") +} + +// ReadDowntimedInstances returns all instances currently downtimed, potentially filtered by cluster +func ReadDowntimedInstances(clusterName string) ([](*Instance), error) { + condition := ` + ifnull( + database_instance_downtime.downtime_active = 1 + and database_instance_downtime.end_timestamp > now() + , 0) + and ? IN ('', cluster_name) + ` + return readInstancesByCondition(condition, sqlutils.Args(clusterName), "cluster_name asc, replication_depth asc") +} + +// ReadClusterCandidateInstances reads cluster instances which are also marked as candidates +func ReadClusterCandidateInstances(clusterName string) ([](*Instance), error) { + condition := ` + cluster_name = ? + and concat(hostname, ':', port) in ( + select concat(hostname, ':', port) + from candidate_database_instance + where promotion_rule in ('must', 'prefer') + ) + ` + return readInstancesByCondition(condition, sqlutils.Args(clusterName), "") +} + +// ReadClusterNeutralPromotionRuleInstances reads cluster instances whose promotion-rule is marked as 'neutral' +func ReadClusterNeutralPromotionRuleInstances(clusterName string) (neutralInstances [](*Instance), err error) { + instances, err := ReadClusterInstances(clusterName) + if err != nil { + return neutralInstances, err + } + for _, instance := range instances { + if instance.PromotionRule == NeutralPromoteRule { + neutralInstances = append(neutralInstances, instance) + } + } + return neutralInstances, nil +} + +// filterOSCInstances will filter the given list such that only replicas fit for OSC control remain. +func filterOSCInstances(instances [](*Instance)) [](*Instance) { + result := [](*Instance){} + for _, instance := range instances { + if RegexpMatchPatterns(instance.Key.StringCode(), config.Config.OSCIgnoreHostnameFilters) { + continue + } + if instance.IsBinlogServer() { + continue + } + if !instance.IsLastCheckValid { + continue + } + result = append(result, instance) + } + return result +} + +// GetClusterOSCReplicas returns a heuristic list of replicas which are fit as controll replicas for an OSC operation. +// These would be intermediate masters +func GetClusterOSCReplicas(clusterName string) ([](*Instance), error) { + intermediateMasters := [](*Instance){} + result := [](*Instance){} + var err error + if strings.Index(clusterName, "'") >= 0 { + return [](*Instance){}, log.Errorf("Invalid cluster name: %s", clusterName) + } + { + // Pick up to two busiest IMs + condition := ` + replication_depth = 1 + and num_slave_hosts > 0 + and cluster_name = ? + ` + intermediateMasters, err = readInstancesByCondition(condition, sqlutils.Args(clusterName), "") + if err != nil { + return result, err + } + sort.Sort(sort.Reverse(InstancesByCountReplicas(intermediateMasters))) + intermediateMasters = filterOSCInstances(intermediateMasters) + intermediateMasters = intermediateMasters[0:math.MinInt(2, len(intermediateMasters))] + result = append(result, intermediateMasters...) 
+ } + { + // Get 2 replicas of found IMs, if possible + if len(intermediateMasters) == 1 { + // Pick 2 replicas for this IM + replicas, err := ReadReplicaInstances(&(intermediateMasters[0].Key)) + if err != nil { + return result, err + } + sort.Sort(sort.Reverse(InstancesByCountReplicas(replicas))) + replicas = filterOSCInstances(replicas) + replicas = replicas[0:math.MinInt(2, len(replicas))] + result = append(result, replicas...) + + } + if len(intermediateMasters) == 2 { + // Pick one replica from each IM (should be possible) + for _, im := range intermediateMasters { + replicas, err := ReadReplicaInstances(&im.Key) + if err != nil { + return result, err + } + sort.Sort(sort.Reverse(InstancesByCountReplicas(replicas))) + replicas = filterOSCInstances(replicas) + if len(replicas) > 0 { + result = append(result, replicas[0]) + } + } + } + } + { + // Get 2 3rd tier replicas, if possible + condition := ` + replication_depth = 3 + and cluster_name = ? + ` + replicas, err := readInstancesByCondition(condition, sqlutils.Args(clusterName), "") + if err != nil { + return result, err + } + sort.Sort(sort.Reverse(InstancesByCountReplicas(replicas))) + replicas = filterOSCInstances(replicas) + replicas = replicas[0:math.MinInt(2, len(replicas))] + result = append(result, replicas...) + } + { + // Get 2 1st tier leaf replicas, if possible + condition := ` + replication_depth = 1 + and num_slave_hosts = 0 + and cluster_name = ? + ` + replicas, err := readInstancesByCondition(condition, sqlutils.Args(clusterName), "") + if err != nil { + return result, err + } + replicas = filterOSCInstances(replicas) + replicas = replicas[0:math.MinInt(2, len(replicas))] + result = append(result, replicas...) + } + + return result, nil +} + +// GetClusterGhostReplicas returns a list of replicas that can serve as the connected servers +// for a [gh-ost](https://github.com/github/gh-ost) operation. A gh-ost operation prefers to talk +// to a RBR replica that has no children. +func GetClusterGhostReplicas(clusterName string) (result [](*Instance), err error) { + condition := ` + replication_depth > 0 + and binlog_format = 'ROW' + and cluster_name = ? 
+ ` + instances, err := readInstancesByCondition(condition, sqlutils.Args(clusterName), "num_slave_hosts asc") + if err != nil { + return result, err + } + + for _, instance := range instances { + skipThisHost := false + if instance.IsBinlogServer() { + skipThisHost = true + } + if !instance.IsLastCheckValid { + skipThisHost = true + } + if !instance.LogBinEnabled { + skipThisHost = true + } + if !instance.LogReplicationUpdatesEnabled { + skipThisHost = true + } + if !skipThisHost { + result = append(result, instance) + } + } + + return result, err +} + +// GetInstancesMaxLag returns the maximum lag in a set of instances +func GetInstancesMaxLag(instances [](*Instance)) (maxLag int64, err error) { + if len(instances) == 0 { + return 0, log.Errorf("No instances found in GetInstancesMaxLag") + } + for _, clusterInstance := range instances { + if clusterInstance.ReplicationLagSeconds.Valid && clusterInstance.ReplicationLagSeconds.Int64 > maxLag { + maxLag = clusterInstance.ReplicationLagSeconds.Int64 + } + } + return maxLag, nil +} + +// GetClusterHeuristicLag returns a heuristic lag for a cluster, based on its OSC replicas +func GetClusterHeuristicLag(clusterName string) (int64, error) { + instances, err := GetClusterOSCReplicas(clusterName) + if err != nil { + return 0, err + } + return GetInstancesMaxLag(instances) +} + +// GetHeuristicClusterPoolInstances returns instances of a cluster which are also pooled. If `pool` argument +// is empty, all pools are considered, otherwise, only instances of given pool are considered. +func GetHeuristicClusterPoolInstances(clusterName string, pool string) (result [](*Instance), err error) { + result = [](*Instance){} + instances, err := ReadClusterInstances(clusterName) + if err != nil { + return result, err + } + + pooledInstanceKeys := NewInstanceKeyMap() + clusterPoolInstances, err := ReadClusterPoolInstances(clusterName, pool) + if err != nil { + return result, err + } + for _, clusterPoolInstance := range clusterPoolInstances { + pooledInstanceKeys.AddKey(InstanceKey{Hostname: clusterPoolInstance.Hostname, Port: clusterPoolInstance.Port}) + } + + for _, instance := range instances { + skipThisHost := false + if instance.IsBinlogServer() { + skipThisHost = true + } + if !instance.IsLastCheckValid { + skipThisHost = true + } + if !pooledInstanceKeys.HasKey(instance.Key) { + skipThisHost = true + } + if !skipThisHost { + result = append(result, instance) + } + } + + return result, err +} + +// GetHeuristicClusterPoolInstancesLag returns a heuristic lag for the instances participating +// in a cluster pool (or all the cluster's pools) +func GetHeuristicClusterPoolInstancesLag(clusterName string, pool string) (int64, error) { + instances, err := GetHeuristicClusterPoolInstances(clusterName, pool) + if err != nil { + return 0, err + } + return GetInstancesMaxLag(instances) +} + +// updateInstanceClusterName +func updateInstanceClusterName(instance *Instance) error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + update + database_instance + set + cluster_name=? + where + hostname=? and port=? 
+		`, instance.ClusterName, instance.Key.Hostname, instance.Key.Port,
+		)
+		if err != nil {
+			return log.Errore(err)
+		}
+		AuditOperation("update-cluster-name", &instance.Key, fmt.Sprintf("set to %s", instance.ClusterName))
+		return nil
+	}
+	return ExecDBWriteFunc(writeFunc)
+}
+
+// ReplaceClusterName replaces all occurrences of oldClusterName with newClusterName
+// It is called after a master failover
+func ReplaceClusterName(oldClusterName string, newClusterName string) error {
+	if oldClusterName == "" {
+		return log.Errorf("replaceClusterName: skipping empty oldClusterName")
+	}
+	if newClusterName == "" {
+		return log.Errorf("replaceClusterName: skipping empty newClusterName")
+	}
+	writeFunc := func() error {
+		_, err := db.ExecOrchestrator(`
+			update
+				database_instance
+			set
+				cluster_name=?
+			where
+				cluster_name=?
+		`, newClusterName, oldClusterName,
+		)
+		if err != nil {
+			return log.Errore(err)
+		}
+		AuditOperation("replace-cluster-name", nil, fmt.Sprintf("replaced %s with %s", oldClusterName, newClusterName))
+		return nil
+	}
+	return ExecDBWriteFunc(writeFunc)
+}
+
+// ReviewUnseenInstances reviews instances that have not been seen (supposedly dead) and updates some of their data
+func ReviewUnseenInstances() error {
+	instances, err := ReadUnseenInstances()
+	if err != nil {
+		return log.Errore(err)
+	}
+	operations := 0
+	for _, instance := range instances {
+		instance := instance
+
+		masterHostname, err := ResolveHostname(instance.MasterKey.Hostname)
+		if err != nil {
+			log.Errore(err)
+			continue
+		}
+		instance.MasterKey.Hostname = masterHostname
+		savedClusterName := instance.ClusterName
+
+		if err := ReadInstanceClusterAttributes(instance); err != nil {
+			log.Errore(err)
+		} else if instance.ClusterName != savedClusterName {
+			updateInstanceClusterName(instance)
+			operations++
+		}
+	}
+
+	AuditOperation("review-unseen-instances", nil, fmt.Sprintf("Operations: %d", operations))
+	return err
+}
+
+// readUnseenMasterKeys will read a list of masters that have never been seen, and yet whose replicas
+// seem to be replicating.
+func readUnseenMasterKeys() ([]InstanceKey, error) {
+	res := []InstanceKey{}
+
+	err := db.QueryOrchestratorRowsMap(`
+			SELECT DISTINCT
+				slave_instance.master_host, slave_instance.master_port
+			FROM
+				database_instance slave_instance
+			LEFT JOIN
+				hostname_resolve ON (slave_instance.master_host = hostname_resolve.hostname)
+			LEFT JOIN
+				database_instance master_instance ON (
+					COALESCE(hostname_resolve.resolved_hostname, slave_instance.master_host) = master_instance.hostname
+					and slave_instance.master_port = master_instance.port)
+			WHERE
+				master_instance.last_checked IS NULL
+				and slave_instance.master_host != ''
+				and slave_instance.master_host != '_'
+				and slave_instance.master_port > 0
+				and slave_instance.slave_io_running = 1
+			`, func(m sqlutils.RowMap) error {
+		instanceKey, _ := NewResolveInstanceKey(m.GetString("master_host"), m.GetInt("master_port"))
+		// we ignore the error. It can be expected that we are unable to resolve the hostname.
+		// Maybe that's how we got here in the first place!
+		res = append(res, *instanceKey)
+
+		return nil
+	})
+	if err != nil {
+		return res, log.Errore(err)
+	}
+
+	return res, nil
+}
+
+// InjectSeed: intended to be used to inject an instance upon startup, assuming it's not already known to orchestrator.
+func InjectSeed(instanceKey *InstanceKey) error {
+	if instanceKey == nil {
+		return fmt.Errorf("InjectSeed: nil instanceKey")
+	}
+	clusterName := instanceKey.StringCode()
+	// minimal details:
+	instance := &Instance{Key: *instanceKey, Version: "Unknown", ClusterName: clusterName}
+	instance.SetSeed()
+	err := WriteInstance(instance, false, nil)
+	log.Debugf("InjectSeed: %+v, %+v", *instanceKey, err)
+	AuditOperation("inject-seed", instanceKey, "injected")
+	return err
+}
+
+// InjectUnseenMasters will review masters of instances that are known to be replicating, yet which are not listed
+// in database_instance. Since their replicas are listed as replicating, we can assume that such masters actually do
+// exist: we shall therefore inject them with minimal details into the database_instance table.
+func InjectUnseenMasters() error {
+
+	unseenMasterKeys, err := readUnseenMasterKeys()
+	if err != nil {
+		return err
+	}
+
+	operations := 0
+	for _, masterKey := range unseenMasterKeys {
+		masterKey := masterKey
+
+		if RegexpMatchPatterns(masterKey.StringCode(), config.Config.DiscoveryIgnoreMasterHostnameFilters) {
+			log.Debugf("InjectUnseenMasters: skipping discovery of %+v because it matches DiscoveryIgnoreMasterHostnameFilters", masterKey)
+			continue
+		}
+		if RegexpMatchPatterns(masterKey.StringCode(), config.Config.DiscoveryIgnoreHostnameFilters) {
+			log.Debugf("InjectUnseenMasters: skipping discovery of %+v because it matches DiscoveryIgnoreHostnameFilters", masterKey)
+			continue
+		}
+
+		clusterName := masterKey.StringCode()
+		// minimal details:
+		instance := Instance{Key: masterKey, Version: "Unknown", ClusterName: clusterName}
+		if err := WriteInstance(&instance, false, nil); err == nil {
+			operations++
+		}
+	}
+
+	AuditOperation("inject-unseen-masters", nil, fmt.Sprintf("Operations: %d", operations))
+	return err
+}
+
+// ForgetUnseenInstancesDifferentlyResolved will purge instances which are invalid, and whose hostname
+// appears in the hostname_resolve table; this means that some time in the past their hostname was unresolved, and now
+// resolves to a different value; the old hostname is never accessed anymore and the old entry should be removed.
+func ForgetUnseenInstancesDifferentlyResolved() error {
+	query := `
+			select
+				database_instance.hostname, database_instance.port
+			from
+				hostname_resolve
+				JOIN database_instance ON (hostname_resolve.hostname = database_instance.hostname)
+			where
+				hostname_resolve.hostname != hostname_resolve.resolved_hostname
+				AND ifnull(last_checked <= last_seen, 0) = 0
+	`
+	keys := NewInstanceKeyMap()
+	err := db.QueryOrchestrator(query, nil, func(m sqlutils.RowMap) error {
+		key := InstanceKey{
+			Hostname: m.GetString("hostname"),
+			Port:     m.GetInt("port"),
+		}
+		keys.AddKey(key)
+		return nil
+	})
+	var rowsAffected int64 = 0
+	for _, key := range keys.GetInstanceKeys() {
+		sqlResult, err := db.ExecOrchestrator(`
+			delete from
+				database_instance
+			where
+				hostname = ? and port = ?
+		`, key.Hostname, key.Port,
+		)
+		if err != nil {
+			return log.Errore(err)
+		}
+		rows, err := sqlResult.RowsAffected()
+		if err != nil {
+			return log.Errore(err)
+		}
+		rowsAffected = rowsAffected + rows
+	}
+	AuditOperation("forget-unseen-differently-resolved", nil, fmt.Sprintf("Forgotten instances: %d", rowsAffected))
+	return err
+}
+
+// readUnknownMasterHostnameResolves will figure out the resolved hostnames of master-hosts which cannot be found.
+// It uses the hostname_resolve_history table to heuristically guess the correct hostname (based on "this was the +// last time we saw this hostname and it resolves into THAT") +func readUnknownMasterHostnameResolves() (map[string]string, error) { + res := make(map[string]string) + err := db.QueryOrchestratorRowsMap(` + SELECT DISTINCT + slave_instance.master_host, hostname_resolve_history.resolved_hostname + FROM + database_instance slave_instance + LEFT JOIN hostname_resolve ON (slave_instance.master_host = hostname_resolve.hostname) + LEFT JOIN database_instance master_instance ON ( + COALESCE(hostname_resolve.resolved_hostname, slave_instance.master_host) = master_instance.hostname + and slave_instance.master_port = master_instance.port + ) LEFT JOIN hostname_resolve_history ON (slave_instance.master_host = hostname_resolve_history.hostname) + WHERE + master_instance.last_checked IS NULL + and slave_instance.master_host != '' + and slave_instance.master_host != '_' + and slave_instance.master_port > 0 + `, func(m sqlutils.RowMap) error { + res[m.GetString("master_host")] = m.GetString("resolved_hostname") + return nil + }) + if err != nil { + return res, log.Errore(err) + } + + return res, nil +} + +// ResolveUnknownMasterHostnameResolves fixes missing hostname resolves based on hostname_resolve_history +// The use case is replicas replicating from some unknown-hostname which cannot be otherwise found. This could +// happen due to an expire unresolve together with clearing up of hostname cache. +func ResolveUnknownMasterHostnameResolves() error { + + hostnameResolves, err := readUnknownMasterHostnameResolves() + if err != nil { + return err + } + for hostname, resolvedHostname := range hostnameResolves { + UpdateResolvedHostname(hostname, resolvedHostname) + } + + AuditOperation("resolve-unknown-masters", nil, fmt.Sprintf("Num resolved hostnames: %d", len(hostnameResolves))) + return err +} + +// ReadCountMySQLSnapshots is a utility method to return registered number of snapshots for a given list of hosts +func ReadCountMySQLSnapshots(hostnames []string) (map[string]int, error) { + res := make(map[string]int) + if !config.Config.ServeAgentsHttp { + return res, nil + } + query := fmt.Sprintf(` + select + hostname, + count_mysql_snapshots + from + host_agent + where + hostname in (%s) + order by + hostname + `, sqlutils.InClauseStringValues(hostnames)) + + err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + res[m.GetString("hostname")] = m.GetInt("count_mysql_snapshots") + return nil + }) + + if err != nil { + log.Errore(err) + } + return res, err +} + +// PopulateInstancesAgents will fill in extra data acquired from agents for given instances +// At current this is the number of snapshots. +// This isn't too pretty; it's a push-into-instance-data-that-belongs-to-agent thing. +// Originally the need was to visually present the number of snapshots per host on the web/cluster page, which +// indeed proves to be useful in our experience. 
+func PopulateInstancesAgents(instances [](*Instance)) error { + if len(instances) == 0 { + return nil + } + hostnames := []string{} + for _, instance := range instances { + hostnames = append(hostnames, instance.Key.Hostname) + } + agentsCountMySQLSnapshots, err := ReadCountMySQLSnapshots(hostnames) + if err != nil { + return err + } + for _, instance := range instances { + if count, ok := agentsCountMySQLSnapshots[instance.Key.Hostname]; ok { + instance.CountMySQLSnapshots = count + } + } + + return nil +} + +func GetClusterName(instanceKey *InstanceKey) (clusterName string, err error) { + if clusterName, found := instanceKeyInformativeClusterName.Get(instanceKey.StringCode()); found { + return clusterName.(string), nil + } + query := ` + select + ifnull(max(cluster_name), '') as cluster_name + from + database_instance + where + hostname = ? + and port = ? + ` + err = db.QueryOrchestrator(query, sqlutils.Args(instanceKey.Hostname, instanceKey.Port), func(m sqlutils.RowMap) error { + clusterName = m.GetString("cluster_name") + instanceKeyInformativeClusterName.Set(instanceKey.StringCode(), clusterName, cache.DefaultExpiration) + return nil + }) + + return clusterName, log.Errore(err) +} + +// ReadClusters reads names of all known clusters +func ReadClusters() (clusterNames []string, err error) { + clusters, err := ReadClustersInfo("") + if err != nil { + return clusterNames, err + } + for _, clusterInfo := range clusters { + clusterNames = append(clusterNames, clusterInfo.ClusterName) + } + return clusterNames, nil +} + +// ReadClusterInfo reads some info about a given cluster +func ReadClusterInfo(clusterName string) (*ClusterInfo, error) { + clusters, err := ReadClustersInfo(clusterName) + if err != nil { + return &ClusterInfo{}, err + } + if len(clusters) != 1 { + return &ClusterInfo{}, fmt.Errorf("No cluster info found for %s", clusterName) + } + return &(clusters[0]), nil +} + +// ReadClustersInfo reads names of all known clusters and some aggregated info +func ReadClustersInfo(clusterName string) ([]ClusterInfo, error) { + clusters := []ClusterInfo{} + + whereClause := "" + args := sqlutils.Args() + if clusterName != "" { + whereClause = `where cluster_name = ?` + args = append(args, clusterName) + } + query := fmt.Sprintf(` + select + cluster_name, + count(*) as count_instances, + ifnull(min(alias), cluster_name) as alias, + ifnull(min(domain_name), '') as domain_name + from + database_instance + left join cluster_alias using (cluster_name) + left join cluster_domain_name using (cluster_name) + %s + group by + cluster_name`, whereClause) + + err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + clusterInfo := ClusterInfo{ + ClusterName: m.GetString("cluster_name"), + CountInstances: m.GetUint("count_instances"), + ClusterAlias: m.GetString("alias"), + ClusterDomain: m.GetString("domain_name"), + } + clusterInfo.ApplyClusterAlias() + clusterInfo.ReadRecoveryInfo() + + clusters = append(clusters, clusterInfo) + return nil + }) + + return clusters, err +} + +// Get a listing of KVPair for clusters masters, for all clusters or for a specific cluster. 
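Possibly useful context for reviewers: GetClusterName above is a read-through cache over the backend table, apparently built on github.com/patrickmn/go-cache (inferred from the cache.DefaultExpiration usage). A minimal sketch of the same pattern, with a stand-in backend lookup rather than the real database_instance query:

package main

import (
	"fmt"
	"time"

	"github.com/patrickmn/go-cache"
)

// clusterNameCache plays the role of instanceKeyInformativeClusterName: entries expire,
// so a renamed cluster is eventually re-read from the backend.
var clusterNameCache = cache.New(5*time.Minute, 10*time.Minute)

// getClusterName checks the cache first and only falls back to the (simulated) backend
// query on a miss, storing the result for subsequent calls.
func getClusterName(instanceCode string, queryBackend func(string) string) string {
	if cached, found := clusterNameCache.Get(instanceCode); found {
		return cached.(string)
	}
	clusterName := queryBackend(instanceCode) // stand-in for the database_instance lookup
	clusterNameCache.Set(instanceCode, clusterName, cache.DefaultExpiration)
	return clusterName
}

func main() {
	backend := func(code string) string { return "cluster-of-" + code }
	fmt.Println(getClusterName("db1:3306", backend)) // miss: consults the backend
	fmt.Println(getClusterName("db1:3306", backend)) // hit: served from the cache
}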
+func GetMastersKVPairs(clusterName string) (kvPairs [](*kv.KVPair), err error) { + + clusterAliasMap := make(map[string]string) + if clustersInfo, err := ReadClustersInfo(clusterName); err != nil { + return kvPairs, err + } else { + for _, clusterInfo := range clustersInfo { + clusterAliasMap[clusterInfo.ClusterName] = clusterInfo.ClusterAlias + } + } + + masters, err := ReadWriteableClustersMasters() + if err != nil { + return kvPairs, err + } + for _, master := range masters { + clusterPairs := GetClusterMasterKVPairs(clusterAliasMap[master.ClusterName], &master.Key) + kvPairs = append(kvPairs, clusterPairs...) + } + + return kvPairs, err +} + +// HeuristicallyApplyClusterDomainInstanceAttribute writes down the cluster-domain +// to master-hostname as a general attribute, by reading current topology and **trusting** it to be correct +func HeuristicallyApplyClusterDomainInstanceAttribute(clusterName string) (instanceKey *InstanceKey, err error) { + clusterInfo, err := ReadClusterInfo(clusterName) + if err != nil { + return nil, err + } + + if clusterInfo.ClusterDomain == "" { + return nil, fmt.Errorf("Cannot find domain name for cluster %+v", clusterName) + } + + masters, err := ReadClusterWriteableMaster(clusterName) + if err != nil { + return nil, err + } + if len(masters) != 1 { + return nil, fmt.Errorf("Found %+v potential master for cluster %+v", len(masters), clusterName) + } + instanceKey = &masters[0].Key + return instanceKey, attributes.SetGeneralAttribute(clusterInfo.ClusterDomain, instanceKey.StringCode()) +} + +// GetHeuristicClusterDomainInstanceAttribute attempts detecting the cluster domain +// for the given cluster, and return the instance key associated as writer with that domain +func GetHeuristicClusterDomainInstanceAttribute(clusterName string) (instanceKey *InstanceKey, err error) { + clusterInfo, err := ReadClusterInfo(clusterName) + if err != nil { + return nil, err + } + + if clusterInfo.ClusterDomain == "" { + return nil, fmt.Errorf("Cannot find domain name for cluster %+v", clusterName) + } + + writerInstanceName, err := attributes.GetGeneralAttribute(clusterInfo.ClusterDomain) + if err != nil { + return nil, err + } + return ParseRawInstanceKey(writerInstanceName) +} + +// ReadAllInstanceKeys +func ReadAllInstanceKeys() ([]InstanceKey, error) { + res := []InstanceKey{} + query := ` + select + hostname, port + from + database_instance + ` + err := db.QueryOrchestrator(query, sqlutils.Args(), func(m sqlutils.RowMap) error { + instanceKey, merr := NewResolveInstanceKey(m.GetString("hostname"), m.GetInt("port")) + if merr != nil { + log.Errore(merr) + } else if !InstanceIsForgotten(instanceKey) { + // only if not in "forget" cache + res = append(res, *instanceKey) + } + return nil + }) + return res, log.Errore(err) +} + +// ReadAllInstanceKeysMasterKeys +func ReadAllMinimalInstances() ([]MinimalInstance, error) { + res := []MinimalInstance{} + query := ` + select + hostname, port, master_host, master_port, cluster_name + from + database_instance + ` + err := db.QueryOrchestrator(query, sqlutils.Args(), func(m sqlutils.RowMap) error { + minimalInstance := MinimalInstance{} + minimalInstance.Key = InstanceKey{ + Hostname: m.GetString("hostname"), + Port: m.GetInt("port"), + } + minimalInstance.MasterKey = InstanceKey{ + Hostname: m.GetString("master_host"), + Port: m.GetInt("master_port"), + } + minimalInstance.ClusterName = m.GetString("cluster_name") + + if !InstanceIsForgotten(&minimalInstance.Key) { + // only if not in "forget" cache + res = append(res, 
minimalInstance) + } + return nil + }) + return res, log.Errore(err) +} + +// ReadOutdatedInstanceKeys reads and returns keys for all instances that are not up to date (i.e. +// pre-configured time has passed since they were last checked) +// But we also check for the case where an attempt at instance checking has been made, that hasn't +// resulted in an actual check! This can happen when TCP/IP connections are hung, in which case the "check" +// never returns. In such case we multiply interval by a factor, so as not to open too many connections on +// the instance. +func ReadOutdatedInstanceKeys() ([]InstanceKey, error) { + res := []InstanceKey{} + query := ` + select + hostname, port + from + database_instance + where + case + when last_attempted_check <= last_checked + then last_checked < now() - interval ? second + else last_checked < now() - interval ? second + end + ` + args := sqlutils.Args(config.Config.InstancePollSeconds, 2*config.Config.InstancePollSeconds) + + err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + instanceKey, merr := NewResolveInstanceKey(m.GetString("hostname"), m.GetInt("port")) + if merr != nil { + log.Errore(merr) + } else if !InstanceIsForgotten(instanceKey) { + // only if not in "forget" cache + res = append(res, *instanceKey) + } + // We don;t return an error because we want to keep filling the outdated instances list. + return nil + }) + + if err != nil { + log.Errore(err) + } + return res, err + +} + +func mkInsertOdku(table string, columns []string, values []string, nrRows int, insertIgnore bool) (string, error) { + if len(columns) == 0 { + return "", errors.New("Column list cannot be empty") + } + if nrRows < 1 { + return "", errors.New("nrRows must be a positive number") + } + if len(columns) != len(values) { + return "", errors.New("number of values must be equal to number of columns") + } + + var q bytes.Buffer + var ignore string = "" + if insertIgnore { + ignore = "ignore" + } + var valRow string = fmt.Sprintf("(%s)", strings.Join(values, ", ")) + var val bytes.Buffer + val.WriteString(valRow) + for i := 1; i < nrRows; i++ { + val.WriteString(",\n ") // indent VALUES, see below + val.WriteString(valRow) + } + + var col string = strings.Join(columns, ", ") + var odku bytes.Buffer + odku.WriteString(fmt.Sprintf("%s=VALUES(%s)", columns[0], columns[0])) + for _, c := range columns[1:] { + odku.WriteString(", ") + odku.WriteString(fmt.Sprintf("%s=VALUES(%s)", c, c)) + } + + q.WriteString(fmt.Sprintf(`INSERT %s INTO %s + (%s) + VALUES + %s + ON DUPLICATE KEY UPDATE + %s + `, + ignore, table, col, val.String(), odku.String())) + + return q.String(), nil +} + +func mkInsertOdkuForInstances(instances []*Instance, instanceWasActuallyFound bool, updateLastSeen bool) (string, []interface{}, error) { + if len(instances) == 0 { + return "", nil, nil + } + + insertIgnore := false + if !instanceWasActuallyFound { + insertIgnore = true + } + var columns = []string{ + "hostname", + "port", + "last_checked", + "last_attempted_check", + "last_check_partial_success", + "uptime", + "server_id", + "server_uuid", + "version", + "major_version", + "version_comment", + "binlog_server", + "read_only", + "binlog_format", + "binlog_row_image", + "log_bin", + "log_slave_updates", + "binary_log_file", + "binary_log_pos", + "master_host", + "master_port", + "slave_sql_running", + "slave_io_running", + "replication_sql_thread_state", + "replication_io_thread_state", + "has_replication_filters", + "supports_oracle_gtid", + "oracle_gtid", + "master_uuid", + 
"ancestry_uuid", + "executed_gtid_set", + "gtid_mode", + "gtid_purged", + "gtid_errant", + "mariadb_gtid", + "pseudo_gtid", + "master_log_file", + "read_master_log_pos", + "relay_master_log_file", + "exec_master_log_pos", + "relay_log_file", + "relay_log_pos", + "last_sql_error", + "last_io_error", + "seconds_behind_master", + "slave_lag_seconds", + "sql_delay", + "num_slave_hosts", + "slave_hosts", + "cluster_name", + "suggested_cluster_alias", + "data_center", + "region", + "physical_environment", + "replication_depth", + "is_co_master", + "replication_credentials_available", + "has_replication_credentials", + "allow_tls", + "semi_sync_enforced", + "semi_sync_available", + "semi_sync_master_enabled", + "semi_sync_master_timeout", + "semi_sync_master_wait_for_slave_count", + "semi_sync_replica_enabled", + "semi_sync_master_status", + "semi_sync_master_clients", + "semi_sync_replica_status", + "instance_alias", + "last_discovery_latency", + "replication_group_name", + "replication_group_is_single_primary_mode", + "replication_group_member_state", + "replication_group_member_role", + "replication_group_members", + "replication_group_primary_host", + "replication_group_primary_port", + } + + var values []string = make([]string, len(columns), len(columns)) + for i := range columns { + values[i] = "?" + } + values[2] = "NOW()" // last_checked + values[3] = "NOW()" // last_attempted_check + values[4] = "1" // last_check_partial_success + + if updateLastSeen { + columns = append(columns, "last_seen") + values = append(values, "NOW()") + } + + var args []interface{} + for _, instance := range instances { + // number of columns minus 2 as last_checked and last_attempted_check + // updated with NOW() + args = append(args, instance.Key.Hostname) + args = append(args, instance.Key.Port) + args = append(args, instance.Uptime) + args = append(args, instance.ServerID) + args = append(args, instance.ServerUUID) + args = append(args, instance.Version) + args = append(args, instance.MajorVersionString()) + args = append(args, instance.VersionComment) + args = append(args, instance.IsBinlogServer()) + args = append(args, instance.ReadOnly) + args = append(args, instance.Binlog_format) + args = append(args, instance.BinlogRowImage) + args = append(args, instance.LogBinEnabled) + args = append(args, instance.LogReplicationUpdatesEnabled) + args = append(args, instance.SelfBinlogCoordinates.LogFile) + args = append(args, instance.SelfBinlogCoordinates.LogPos) + args = append(args, instance.MasterKey.Hostname) + args = append(args, instance.MasterKey.Port) + args = append(args, instance.ReplicationSQLThreadRuning) + args = append(args, instance.ReplicationIOThreadRuning) + args = append(args, instance.ReplicationSQLThreadState) + args = append(args, instance.ReplicationIOThreadState) + args = append(args, instance.HasReplicationFilters) + args = append(args, instance.SupportsOracleGTID) + args = append(args, instance.UsingOracleGTID) + args = append(args, instance.MasterUUID) + args = append(args, instance.AncestryUUID) + args = append(args, instance.ExecutedGtidSet) + args = append(args, instance.GTIDMode) + args = append(args, instance.GtidPurged) + args = append(args, instance.GtidErrant) + args = append(args, instance.UsingMariaDBGTID) + args = append(args, instance.UsingPseudoGTID) + args = append(args, instance.ReadBinlogCoordinates.LogFile) + args = append(args, instance.ReadBinlogCoordinates.LogPos) + args = append(args, instance.ExecBinlogCoordinates.LogFile) + args = append(args, 
instance.ExecBinlogCoordinates.LogPos) + args = append(args, instance.RelaylogCoordinates.LogFile) + args = append(args, instance.RelaylogCoordinates.LogPos) + args = append(args, instance.LastSQLError) + args = append(args, instance.LastIOError) + args = append(args, instance.SecondsBehindMaster) + args = append(args, instance.ReplicationLagSeconds) + args = append(args, instance.SQLDelay) + args = append(args, len(instance.Replicas)) + args = append(args, instance.Replicas.ToJSONString()) + args = append(args, instance.ClusterName) + args = append(args, instance.SuggestedClusterAlias) + args = append(args, instance.DataCenter) + args = append(args, instance.Region) + args = append(args, instance.PhysicalEnvironment) + args = append(args, instance.ReplicationDepth) + args = append(args, instance.IsCoMaster) + args = append(args, instance.ReplicationCredentialsAvailable) + args = append(args, instance.HasReplicationCredentials) + args = append(args, instance.AllowTLS) + args = append(args, instance.SemiSyncEnforced) + args = append(args, instance.SemiSyncAvailable) + args = append(args, instance.SemiSyncMasterEnabled) + args = append(args, instance.SemiSyncMasterTimeout) + args = append(args, instance.SemiSyncMasterWaitForReplicaCount) + args = append(args, instance.SemiSyncReplicaEnabled) + args = append(args, instance.SemiSyncMasterStatus) + args = append(args, instance.SemiSyncMasterClients) + args = append(args, instance.SemiSyncReplicaStatus) + args = append(args, instance.InstanceAlias) + args = append(args, instance.LastDiscoveryLatency.Nanoseconds()) + args = append(args, instance.ReplicationGroupName) + args = append(args, instance.ReplicationGroupIsSinglePrimary) + args = append(args, instance.ReplicationGroupMemberState) + args = append(args, instance.ReplicationGroupMemberRole) + args = append(args, instance.ReplicationGroupMembers.ToJSONString()) + args = append(args, instance.ReplicationGroupPrimaryInstanceKey.Hostname) + args = append(args, instance.ReplicationGroupPrimaryInstanceKey.Port) + } + + sql, err := mkInsertOdku("database_instance", columns, values, len(instances), insertIgnore) + if err != nil { + return sql, args, log.Errorf("Failed to build query: %v", err) + } + + return sql, args, nil +} + +// writeManyInstances stores instances in the orchestrator backend +func writeManyInstances(instances []*Instance, instanceWasActuallyFound bool, updateLastSeen bool) error { + writeInstances := [](*Instance){} + for _, instance := range instances { + if InstanceIsForgotten(&instance.Key) && !instance.IsSeed() { + continue + } + writeInstances = append(writeInstances, instance) + } + if len(writeInstances) == 0 { + return nil // nothing to write + } + sql, args, err := mkInsertOdkuForInstances(writeInstances, instanceWasActuallyFound, updateLastSeen) + if err != nil { + return err + } + if _, err := db.ExecOrchestrator(sql, args...); err != nil { + return err + } + return nil +} + +type instanceUpdateObject struct { + instance *Instance + instanceWasActuallyFound bool + lastError error +} + +// instances sorter by instanceKey +type byInstanceKey []*Instance + +func (a byInstanceKey) Len() int { return len(a) } +func (a byInstanceKey) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a byInstanceKey) Less(i, j int) bool { return a[i].Key.SmallerThan(&a[j].Key) } + +var instanceWriteBuffer chan instanceUpdateObject +var forceFlushInstanceWriteBuffer = make(chan bool) + +func enqueueInstanceWrite(instance *Instance, instanceWasActuallyFound bool, lastError error) { + if 
len(instanceWriteBuffer) == config.Config.InstanceWriteBufferSize {
+		// Signal the "flushing" goroutine that there's work.
+		// We prefer doing all bulk flushes from one goroutine.
+		// Non-blocking send to avoid blocking goroutines on sending a flush;
+		// if the "flushing" goroutine cannot receive it, that's because a flush is already in progress.
+		select {
+		case forceFlushInstanceWriteBuffer <- true:
+		default:
+		}
+	}
+	instanceWriteBuffer <- instanceUpdateObject{instance, instanceWasActuallyFound, lastError}
+}
+
+// flushInstanceWriteBuffer saves enqueued instances to the orchestrator backend database
+func flushInstanceWriteBuffer() {
+	var instances []*Instance
+	var lastseen []*Instance // instances to update with last_seen field
+
+	defer func() {
+		// reset stopwatches (TODO: .ResetAll())
+		writeBufferLatency.Reset("wait")
+		writeBufferLatency.Reset("write")
+		writeBufferLatency.Start("wait") // waiting for next flush
+	}()
+
+	writeBufferLatency.Stop("wait")
+
+	if len(instanceWriteBuffer) == 0 {
+		return
+	}
+
+	// There are `DiscoveryMaxConcurrency` many goroutines trying to enqueue an instance into the buffer;
+	// when one instance is flushed from the buffer, one discovery goroutine is ready to enqueue a new instance.
+	// This is why we want to flush all instances in the buffer, up to a maximum of `InstanceWriteBufferSize`.
+	// Otherwise we can flush way more instances than what's expected.
+	for i := 0; i < config.Config.InstanceWriteBufferSize && len(instanceWriteBuffer) > 0; i++ {
+		upd := <-instanceWriteBuffer
+		if upd.instanceWasActuallyFound && upd.lastError == nil {
+			lastseen = append(lastseen, upd.instance)
+		} else {
+			instances = append(instances, upd.instance)
+			log.Debugf("flushInstanceWriteBuffer: will not update database_instance.last_seen due to error: %+v", upd.lastError)
+		}
+	}
+
+	writeBufferLatency.Start("write")
+
+	// sort instances by instanceKey (table pk) to make locking predictable
+	sort.Sort(byInstanceKey(instances))
+	sort.Sort(byInstanceKey(lastseen))
+
+	writeFunc := func() error {
+		err := writeManyInstances(instances, true, false)
+		if err != nil {
+			return log.Errorf("flushInstanceWriteBuffer writemany: %v", err)
+		}
+		err = writeManyInstances(lastseen, true, true)
+		if err != nil {
+			return log.Errorf("flushInstanceWriteBuffer last_seen: %v", err)
+		}
+
+		writeInstanceCounter.Inc(int64(len(instances) + len(lastseen)))
+		return nil
+	}
+	err := ExecDBWriteFunc(writeFunc)
+	if err != nil {
+		log.Errorf("flushInstanceWriteBuffer: %v", err)
+	}
+
+	writeBufferLatency.Stop("write")
+
+	writeBufferMetrics.Append(&WriteBufferMetric{
+		Timestamp:    time.Now(),
+		WaitLatency:  writeBufferLatency.Elapsed("wait"),
+		WriteLatency: writeBufferLatency.Elapsed("write"),
+		Instances:    len(lastseen) + len(instances),
+	})
+}
+
+// WriteInstance stores an instance in the orchestrator backend
+func WriteInstance(instance *Instance, instanceWasActuallyFound bool, lastError error) error {
+	if lastError != nil {
+		log.Debugf("writeInstance: will not update database_instance due to error: %+v", lastError)
+		return nil
+	}
+	return writeManyInstances([]*Instance{instance}, instanceWasActuallyFound, true)
+}
+
+// UpdateInstanceLastChecked updates the last_check timestamp in the orchestrator backed database
+// for a given instance
+func UpdateInstanceLastChecked(instanceKey *InstanceKey, partialSuccess bool) error {
+	writeFunc := func() error {
+		_, err := db.ExecOrchestrator(`
+			update
+				database_instance
+			set
+				last_checked = NOW(),
+				last_check_partial_success = ?
+			where
+				hostname = ?
+				and port = ?`,
+			partialSuccess,
+			instanceKey.Hostname,
+			instanceKey.Port,
+		)
+		return log.Errore(err)
+	}
+	return ExecDBWriteFunc(writeFunc)
+}
+
+// UpdateInstanceLastAttemptedCheck updates the last_attempted_check timestamp in the orchestrator backed database
+// for a given instance.
+// This is used as a failsafe mechanism in case access to the instance gets hung (it happens), in which case
+// the entire ReadTopology gets stuck (and no, connection timeout nor driver timeouts don't help. Don't look at me,
+// the world is a harsh place to live in).
+// And so we make sure to note down *before* we even attempt to access the instance; and this raises a red flag when we
+// wish to access the instance again: if last_attempted_check is *newer* than last_checked, that's bad news and means
+// we have a "hanging" issue.
+func UpdateInstanceLastAttemptedCheck(instanceKey *InstanceKey) error {
+	writeFunc := func() error {
+		_, err := db.ExecOrchestrator(`
+			update
+				database_instance
+			set
+				last_attempted_check = NOW()
+			where
+				hostname = ?
+				and port = ?`,
+			instanceKey.Hostname,
+			instanceKey.Port,
+		)
+		return log.Errore(err)
+	}
+	return ExecDBWriteFunc(writeFunc)
+}
+
+func InstanceIsForgotten(instanceKey *InstanceKey) bool {
+	_, found := forgetInstanceKeys.Get(instanceKey.StringCode())
+	return found
+}
+
+// ForgetInstance removes an instance entry from the orchestrator backed database.
+// It may be auto-rediscovered through topology or requested for discovery by multiple means.
+func ForgetInstance(instanceKey *InstanceKey) error {
+	if instanceKey == nil {
+		return log.Errorf("ForgetInstance(): nil instanceKey")
+	}
+	forgetInstanceKeys.Set(instanceKey.StringCode(), true, cache.DefaultExpiration)
+	sqlResult, err := db.ExecOrchestrator(`
+		delete
+			from database_instance
+		where
+			hostname = ? and port = ?`,
+		instanceKey.Hostname,
+		instanceKey.Port,
+	)
+	if err != nil {
+		return log.Errore(err)
+	}
+	rows, err := sqlResult.RowsAffected()
+	if err != nil {
+		return log.Errore(err)
+	}
+	if rows == 0 {
+		return log.Errorf("ForgetInstance(): instance %+v not found", *instanceKey)
+	}
+	AuditOperation("forget", instanceKey, "")
+	return nil
+}
+
+// ForgetCluster removes all instance entries of a given cluster from the orchestrator backed database.
+// They may be auto-rediscovered through topology or requested for discovery by multiple means.
+func ForgetCluster(clusterName string) error {
+	clusterInstances, err := ReadClusterInstances(clusterName)
+	if err != nil {
+		return err
+	}
+	if len(clusterInstances) == 0 {
+		return nil
+	}
+	for _, instance := range clusterInstances {
+		forgetInstanceKeys.Set(instance.Key.StringCode(), true, cache.DefaultExpiration)
+		AuditOperation("forget", &instance.Key, "")
+	}
+	_, err = db.ExecOrchestrator(`
+		delete
+			from database_instance
+		where
+			cluster_name = ?`,
+		clusterName,
+	)
+	return err
+}
+
+// ForgetLongUnseenInstances will remove entries of all instances that have long since been last seen.
+func ForgetLongUnseenInstances() error {
+	sqlResult, err := db.ExecOrchestrator(`
+		delete
+			from database_instance
+		where
+			last_seen < NOW() - interval ?
hour`, + config.Config.UnseenInstanceForgetHours, + ) + if err != nil { + return log.Errore(err) + } + rows, err := sqlResult.RowsAffected() + if err != nil { + return log.Errore(err) + } + AuditOperation("forget-unseen", nil, fmt.Sprintf("Forgotten instances: %d", rows)) + return err +} + +// SnapshotTopologies records topology graph for all existing topologies +func SnapshotTopologies() error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + insert ignore into + database_instance_topology_history (snapshot_unix_timestamp, + hostname, port, master_host, master_port, cluster_name, version) + select + UNIX_TIMESTAMP(NOW()), + hostname, port, master_host, master_port, cluster_name, version + from + database_instance + `, + ) + if err != nil { + return log.Errore(err) + } + + return nil + } + return ExecDBWriteFunc(writeFunc) +} + +// ReadHistoryClusterInstances reads (thin) instances from history +func ReadHistoryClusterInstances(clusterName string, historyTimestampPattern string) ([](*Instance), error) { + instances := [](*Instance){} + + query := ` + select + * + from + database_instance_topology_history + where + snapshot_unix_timestamp rlike ? + and cluster_name = ? + order by + hostname, port` + + err := db.QueryOrchestrator(query, sqlutils.Args(historyTimestampPattern, clusterName), func(m sqlutils.RowMap) error { + instance := NewInstance() + + instance.Key.Hostname = m.GetString("hostname") + instance.Key.Port = m.GetInt("port") + instance.MasterKey.Hostname = m.GetString("master_host") + instance.MasterKey.Port = m.GetInt("master_port") + instance.ClusterName = m.GetString("cluster_name") + + instances = append(instances, instance) + return nil + }) + if err != nil { + return instances, log.Errore(err) + } + return instances, err +} + +// RecordInstanceCoordinatesHistory snapshots the binlog coordinates of instances +func RecordInstanceCoordinatesHistory() error { + { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + delete from database_instance_coordinates_history + where + recorded_timestamp < NOW() - INTERVAL ? MINUTE + `, (config.PseudoGTIDCoordinatesHistoryHeuristicMinutes + 2), + ) + return log.Errore(err) + } + ExecDBWriteFunc(writeFunc) + } + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + insert into + database_instance_coordinates_history ( + hostname, port, last_seen, recorded_timestamp, + binary_log_file, binary_log_pos, relay_log_file, relay_log_pos + ) + select + hostname, port, last_seen, NOW(), + binary_log_file, binary_log_pos, relay_log_file, relay_log_pos + from + database_instance + where + ( + binary_log_file != '' + or relay_log_file != '' + ) + `, + ) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +// GetHeuristiclyRecentCoordinatesForInstance returns valid and reasonably recent coordinates for given instance. +func GetHeuristiclyRecentCoordinatesForInstance(instanceKey *InstanceKey) (selfCoordinates *BinlogCoordinates, relayLogCoordinates *BinlogCoordinates, err error) { + query := ` + select + binary_log_file, binary_log_pos, relay_log_file, relay_log_pos + from + database_instance_coordinates_history + where + hostname = ? + and port = ? + and recorded_timestamp <= NOW() - INTERVAL ? 
MINUTE
+		order by
+			recorded_timestamp desc
+			limit 1
+		`
+	err = db.QueryOrchestrator(query, sqlutils.Args(instanceKey.Hostname, instanceKey.Port, config.PseudoGTIDCoordinatesHistoryHeuristicMinutes), func(m sqlutils.RowMap) error {
+		selfCoordinates = &BinlogCoordinates{LogFile: m.GetString("binary_log_file"), LogPos: m.GetInt64("binary_log_pos")}
+		relayLogCoordinates = &BinlogCoordinates{LogFile: m.GetString("relay_log_file"), LogPos: m.GetInt64("relay_log_pos")}
+
+		return nil
+	})
+	return selfCoordinates, relayLogCoordinates, err
+}
+
+// RecordStaleInstanceBinlogCoordinates records the binlog coordinates of an instance suspected to be stale
+func RecordStaleInstanceBinlogCoordinates(instanceKey *InstanceKey, binlogCoordinates *BinlogCoordinates) error {
+	args := sqlutils.Args(
+		instanceKey.Hostname, instanceKey.Port,
+		binlogCoordinates.LogFile, binlogCoordinates.LogPos,
+	)
+	_, err := db.ExecOrchestrator(`
+		delete from
+			database_instance_stale_binlog_coordinates
+		where
+			hostname=? and port=?
+			and (
+				binary_log_file != ?
+				or binary_log_pos != ?
+			)
+		`,
+		args...,
+	)
+	if err != nil {
+		return log.Errore(err)
+	}
+	_, err = db.ExecOrchestrator(`
+		insert ignore into
+			database_instance_stale_binlog_coordinates (
+				hostname, port, binary_log_file, binary_log_pos, first_seen
+			)
+			values (
+				?, ?, ?, ?, NOW()
+			)`,
+		args...)
+	return log.Errore(err)
+}
+
+func ExpireStaleInstanceBinlogCoordinates() error {
+	expireSeconds := config.Config.ReasonableReplicationLagSeconds * 2
+	if expireSeconds < config.StaleInstanceCoordinatesExpireSeconds {
+		expireSeconds = config.StaleInstanceCoordinatesExpireSeconds
+	}
+	writeFunc := func() error {
+		_, err := db.ExecOrchestrator(`
+			delete from database_instance_stale_binlog_coordinates
+			where first_seen < NOW() - INTERVAL ? SECOND
+			`, expireSeconds,
+		)
+		return log.Errore(err)
+	}
+	return ExecDBWriteFunc(writeFunc)
+}
+
+// GetPreviousKnownRelayLogCoordinatesForInstance returns known relay log coordinates that are not the
+// exact current coordinates
+func GetPreviousKnownRelayLogCoordinatesForInstance(instance *Instance) (relayLogCoordinates *BinlogCoordinates, err error) {
+	query := `
+		select
+			relay_log_file, relay_log_pos
+		from
+			database_instance_coordinates_history
+		where
+			hostname = ?
+			and port = ?
+			and (relay_log_file, relay_log_pos) < (?, ?)
+			and relay_log_file != ''
+			and relay_log_pos != 0
+		order by
+			recorded_timestamp desc
+			limit 1
+		`
+	err = db.QueryOrchestrator(query, sqlutils.Args(
+		instance.Key.Hostname,
+		instance.Key.Port,
+		instance.RelaylogCoordinates.LogFile,
+		instance.RelaylogCoordinates.LogPos,
+	), func(m sqlutils.RowMap) error {
+		relayLogCoordinates = &BinlogCoordinates{LogFile: m.GetString("relay_log_file"), LogPos: m.GetInt64("relay_log_pos")}
+
+		return nil
+	})
+	return relayLogCoordinates, err
+}
+
+// ResetInstanceRelaylogCoordinatesHistory forgets about the history of an instance. This action is desirable
+// when relay logs become obsolete or irrelevant. Such is the case on `CHANGE MASTER TO`: servers get completely
+// new relay logs.
+func ResetInstanceRelaylogCoordinatesHistory(instanceKey *InstanceKey) error {
+	writeFunc := func() error {
+		_, err := db.ExecOrchestrator(`
+			update database_instance_coordinates_history
+			set relay_log_file='', relay_log_pos=0
+			where
+				hostname=? and port=?
+ `, instanceKey.Hostname, instanceKey.Port, + ) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +// FigureClusterName will make a best effort to deduce a cluster name using either a given alias +// or an instanceKey. First attempt is at alias, and if that doesn't work, we try instanceKey. +// - clusterHint may be an empty string +func FigureClusterName(clusterHint string, instanceKey *InstanceKey, thisInstanceKey *InstanceKey) (clusterName string, err error) { + // Look for exact matches, first. + + if clusterHint != "" { + // Exact cluster name match: + if clusterInfo, err := ReadClusterInfo(clusterHint); err == nil && clusterInfo != nil { + return clusterInfo.ClusterName, nil + } + // Exact cluster alias match: + if clustersInfo, err := ReadClustersInfo(""); err == nil { + for _, clusterInfo := range clustersInfo { + if clusterInfo.ClusterAlias == clusterHint { + return clusterInfo.ClusterName, nil + } + } + } + } + + clusterByInstanceKey := func(instanceKey *InstanceKey) (hasResult bool, clusterName string, err error) { + if instanceKey == nil { + return false, "", nil + } + instance, _, err := ReadInstance(instanceKey) + if err != nil { + return true, clusterName, log.Errore(err) + } + if instance != nil { + if instance.ClusterName == "" { + return true, clusterName, log.Errorf("Unable to determine cluster name for %+v, empty cluster name. clusterHint=%+v", instance.Key, clusterHint) + } + return true, instance.ClusterName, nil + } + return false, "", nil + } + // exact instance key: + if hasResult, clusterName, err := clusterByInstanceKey(instanceKey); hasResult { + return clusterName, err + } + // fuzzy instance key: + if hasResult, clusterName, err := clusterByInstanceKey(ReadFuzzyInstanceKeyIfPossible(instanceKey)); hasResult { + return clusterName, err + } + // Let's see about _this_ instance + if hasResult, clusterName, err := clusterByInstanceKey(thisInstanceKey); hasResult { + return clusterName, err + } + return clusterName, log.Errorf("Unable to determine cluster name. clusterHint=%+v", clusterHint) +} + +// FigureInstanceKey tries to figure out a key +func FigureInstanceKey(instanceKey *InstanceKey, thisInstanceKey *InstanceKey) (*InstanceKey, error) { + if figuredKey := ReadFuzzyInstanceKeyIfPossible(instanceKey); figuredKey != nil { + return figuredKey, nil + } + figuredKey := thisInstanceKey + if figuredKey == nil { + return nil, log.Errorf("Cannot deduce instance %+v", instanceKey) + } + return figuredKey, nil +} + +// PopulateGroupReplicationInformation obtains information about Group Replication for this host as well as other hosts +// who are members of the same group (if any). +func PopulateGroupReplicationInformation(instance *Instance, db *sql.DB) error { + q := ` + SELECT + MEMBER_ID, + MEMBER_HOST, + MEMBER_PORT, + MEMBER_STATE, + MEMBER_ROLE, + @@global.group_replication_group_name, + @@global.group_replication_single_primary_mode + FROM + performance_schema.replication_group_members + ` + rows, err := db.Query(q) + if err != nil { + _, grNotSupported := GroupReplicationNotSupportedErrors[err.(*mysql.MySQLError).Number] + if grNotSupported { + return nil // If GR is not supported by the instance, just exit + } else { + // If we got here, the query failed but not because the server does not support group replication. 
Let's + // log the error + return log.Error("There was an error trying to check group replication information for instance "+ + "%+v: %+v", instance.Key, err) + } + } + defer rows.Close() + foundGroupPrimary := false + // Loop over the query results and populate GR instance attributes from the row that matches the instance being + // probed. In addition, figure out the group primary and also add it as attribute of the instance. + for rows.Next() { + var ( + uuid string + host string + port uint16 + state string + role string + groupName string + singlePrimaryGroup bool + ) + err := rows.Scan(&uuid, &host, &port, &state, &role, &groupName, &singlePrimaryGroup) + if err == nil { + // ToDo: add support for multi primary groups. + if !singlePrimaryGroup { + log.Debugf("This host seems to belong to a multi-primary replication group, which we don't " + + "support") + break + } + groupMemberKey, err := NewResolveInstanceKey(host, int(port)) + if err != nil { + log.Errorf("Unable to resolve instance for group member %v:%v", host, port) + continue + } + // Set the replication group primary from what we find in performance_schema.replication_group_members for + // the instance being discovered. + if role == GroupReplicationMemberRolePrimary && groupMemberKey != nil { + instance.ReplicationGroupPrimaryInstanceKey = *groupMemberKey + foundGroupPrimary = true + } + if uuid == instance.ServerUUID { + instance.ReplicationGroupName = groupName + instance.ReplicationGroupIsSinglePrimary = singlePrimaryGroup + instance.ReplicationGroupMemberRole = role + instance.ReplicationGroupMemberState = state + } else { + instance.AddGroupMemberKey(groupMemberKey) // This helps us keep info on all members of the same group as the instance + } + } else { + log.Errorf("Unable to scan row group replication information while processing %+v, skipping the "+ + "row and continuing: %+v", instance.Key, err) + } + } + // If we did not manage to find the primary of the group in performance_schema.replication_group_members, we are + // likely to have been expelled from the group. Still, try to find out the primary of the group and set it for the + // instance being discovered, so that it is identified as part of the same cluster + if !foundGroupPrimary { + err = ReadReplicationGroupPrimary(instance) + if err != nil { + return log.Errorf("Unable to find the group primary of instance %+v even though it seems to be "+ + "part of a replication group", instance.Key) + } + } + return nil +} + +// RegisterInjectedPseudoGTID +func RegisterInjectedPseudoGTID(clusterName string) error { + query := ` + insert into cluster_injected_pseudo_gtid ( + cluster_name, + time_injected + ) values (?, now()) + on duplicate key update + cluster_name=values(cluster_name), + time_injected=now() + ` + args := sqlutils.Args(clusterName) + writeFunc := func() error { + _, err := db.ExecOrchestrator(query, args...) + if err == nil { + clusterInjectedPseudoGTIDCache.Set(clusterName, true, cache.DefaultExpiration) + } + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +// ExpireInjectedPseudoGTID +func ExpireInjectedPseudoGTID() error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + delete from cluster_injected_pseudo_gtid + where time_injected < NOW() - INTERVAL ? 
MINUTE + `, config.PseudoGTIDExpireMinutes, + ) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +// isInjectedPseudoGTID reads from backend DB / cache +func isInjectedPseudoGTID(clusterName string) (injected bool, err error) { + if injectedValue, found := clusterInjectedPseudoGTIDCache.Get(clusterName); found { + return injectedValue.(bool), err + } + query := ` + select + count(*) as is_injected + from + cluster_injected_pseudo_gtid + where + cluster_name = ? + ` + err = db.QueryOrchestrator(query, sqlutils.Args(clusterName), func(m sqlutils.RowMap) error { + injected = m.GetBool("is_injected") + return nil + }) + clusterInjectedPseudoGTIDCache.Set(clusterName, injected, cache.DefaultExpiration) + return injected, log.Errore(err) +} diff --git a/go/vt/orchestrator/inst/instance_dao_test.go b/go/vt/orchestrator/inst/instance_dao_test.go new file mode 100644 index 0000000000..b028054eac --- /dev/null +++ b/go/vt/orchestrator/inst/instance_dao_test.go @@ -0,0 +1,117 @@ +package inst + +import ( + "bytes" + "fmt" + "regexp" + "strings" + "testing" + + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +var ( + i710k = InstanceKey{Hostname: "i710", Port: 3306} + i720k = InstanceKey{Hostname: "i720", Port: 3306} + i730k = InstanceKey{Hostname: "i730", Port: 3306} +) + +var ( + spacesRegexp = regexp.MustCompile(`[ \t\n\r]+`) +) + +func normalizeQuery(name string) string { + name = strings.Replace(name, "`", "", -1) + name = spacesRegexp.ReplaceAllString(name, " ") + name = strings.TrimSpace(name) + return name +} + +func stripSpaces(s string) string { + s = spacesRegexp.ReplaceAllString(s, "") + return s +} + +func mkTestInstances() []*Instance { + i710 := Instance{Key: i710k, ServerID: 710, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 10}} + i720 := Instance{Key: i720k, ServerID: 720, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 20}} + i730 := Instance{Key: i730k, ServerID: 730, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 30}} + instances := []*Instance{&i710, &i720, &i730} + for _, instance := range instances { + instance.Version = "5.6.7" + instance.VersionComment = "MySQL" + instance.Binlog_format = "STATEMENT" + instance.BinlogRowImage = "FULL" + } + return instances +} + +func TestMkInsertOdkuSingle(t *testing.T) { + instances := mkTestInstances() + + sql, args, err := mkInsertOdkuForInstances(nil, true, true) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(sql, "") + test.S(t).ExpectEquals(len(args), 0) + + // one instance + s1 := `INSERT ignore INTO database_instance + (hostname, port, last_checked, last_attempted_check, last_check_partial_success, uptime, server_id, server_uuid, + version, major_version, version_comment, binlog_server, read_only, binlog_format, + binlog_row_image, log_bin, log_slave_updates, binary_log_file, binary_log_pos, master_host, master_port, + slave_sql_running, slave_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, master_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant, mariadb_gtid, pseudo_gtid, + master_log_file, read_master_log_pos, relay_master_log_file, exec_master_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, seconds_behind_master, slave_lag_seconds, sql_delay, num_slave_hosts, slave_hosts, cluster_name, suggested_cluster_alias, data_center, region, physical_environment, replication_depth, 
is_co_master, replication_credentials_available, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_available, semi_sync_master_enabled, semi_sync_master_timeout, semi_sync_master_wait_for_slave_count, semi_sync_replica_enabled, semi_sync_master_status, semi_sync_master_clients, semi_sync_replica_status, instance_alias, last_discovery_latency, replication_group_name, replication_group_is_single_primary_mode, replication_group_member_state, replication_group_member_role, replication_group_members, replication_group_primary_host, replication_group_primary_port, last_seen) + VALUES + (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) + ON DUPLICATE KEY UPDATE + hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), uptime=VALUES(uptime), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_slave_updates=VALUES(log_slave_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), master_host=VALUES(master_host), master_port=VALUES(master_port), slave_sql_running=VALUES(slave_sql_running), slave_io_running=VALUES(slave_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), master_uuid=VALUES(master_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), master_log_file=VALUES(master_log_file), read_master_log_pos=VALUES(read_master_log_pos), relay_master_log_file=VALUES(relay_master_log_file), exec_master_log_pos=VALUES(exec_master_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), seconds_behind_master=VALUES(seconds_behind_master), slave_lag_seconds=VALUES(slave_lag_seconds), sql_delay=VALUES(sql_delay), num_slave_hosts=VALUES(num_slave_hosts), slave_hosts=VALUES(slave_hosts), cluster_name=VALUES(cluster_name), suggested_cluster_alias=VALUES(suggested_cluster_alias), data_center=VALUES(data_center), region=VALUES(region), physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_master=VALUES(is_co_master), replication_credentials_available=VALUES(replication_credentials_available), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), + semi_sync_enforced=VALUES(semi_sync_enforced), semi_sync_available=VALUES(semi_sync_available), semi_sync_master_enabled=VALUES(semi_sync_master_enabled), semi_sync_master_timeout=VALUES(semi_sync_master_timeout), 
semi_sync_master_wait_for_slave_count=VALUES(semi_sync_master_wait_for_slave_count), semi_sync_replica_enabled=VALUES(semi_sync_replica_enabled), semi_sync_master_status=VALUES(semi_sync_master_status), semi_sync_master_clients=VALUES(semi_sync_master_clients), semi_sync_replica_status=VALUES(semi_sync_replica_status), + instance_alias=VALUES(instance_alias), last_discovery_latency=VALUES(last_discovery_latency), replication_group_name=VALUES(replication_group_name), replication_group_is_single_primary_mode=VALUES(replication_group_is_single_primary_mode), replication_group_member_state=VALUES(replication_group_member_state), replication_group_member_role=VALUES(replication_group_member_role), replication_group_members=VALUES(replication_group_members), replication_group_primary_host=VALUES(replication_group_primary_host), replication_group_primary_port=VALUES(replication_group_primary_port), last_seen=VALUES(last_seen) + ` + a1 := `i710, 3306, 0, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, + FULL, false, false, , 0, , 0, + false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, 0, [], , , , , , 0, false, false, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, ` + + sql1, args1, err := mkInsertOdkuForInstances(instances[:1], false, true) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(normalizeQuery(sql1), normalizeQuery(s1)) + test.S(t).ExpectEquals(stripSpaces(fmtArgs(args1)), stripSpaces(a1)) +} + +func TestMkInsertOdkuThree(t *testing.T) { + instances := mkTestInstances() + + // three instances + s3 := `INSERT INTO database_instance + (hostname, port, last_checked, last_attempted_check, last_check_partial_success, uptime, server_id, server_uuid, version, major_version, version_comment, binlog_server, read_only, binlog_format, binlog_row_image, log_bin, log_slave_updates, binary_log_file, binary_log_pos, master_host, master_port, slave_sql_running, slave_io_running, replication_sql_thread_state, replication_io_thread_state, has_replication_filters, supports_oracle_gtid, oracle_gtid, master_uuid, ancestry_uuid, executed_gtid_set, gtid_mode, gtid_purged, gtid_errant, mariadb_gtid, pseudo_gtid, master_log_file, read_master_log_pos, relay_master_log_file, exec_master_log_pos, relay_log_file, relay_log_pos, last_sql_error, last_io_error, seconds_behind_master, slave_lag_seconds, sql_delay, num_slave_hosts, slave_hosts, cluster_name, suggested_cluster_alias, data_center, region, physical_environment, replication_depth, is_co_master, replication_credentials_available, has_replication_credentials, allow_tls, semi_sync_enforced, semi_sync_available, semi_sync_master_enabled, semi_sync_master_timeout, semi_sync_master_wait_for_slave_count, + semi_sync_replica_enabled, semi_sync_master_status, semi_sync_master_clients, semi_sync_replica_status, instance_alias, last_discovery_latency, replication_group_name, replication_group_is_single_primary_mode, replication_group_member_state, replication_group_member_role, replication_group_members, replication_group_primary_host, replication_group_primary_port, last_seen) + VALUES + (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), + (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, 
?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()), + (?, ?, NOW(), NOW(), 1, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, NOW()) + ON DUPLICATE KEY UPDATE + hostname=VALUES(hostname), port=VALUES(port), last_checked=VALUES(last_checked), last_attempted_check=VALUES(last_attempted_check), last_check_partial_success=VALUES(last_check_partial_success), uptime=VALUES(uptime), server_id=VALUES(server_id), server_uuid=VALUES(server_uuid), version=VALUES(version), major_version=VALUES(major_version), version_comment=VALUES(version_comment), binlog_server=VALUES(binlog_server), read_only=VALUES(read_only), binlog_format=VALUES(binlog_format), binlog_row_image=VALUES(binlog_row_image), log_bin=VALUES(log_bin), log_slave_updates=VALUES(log_slave_updates), binary_log_file=VALUES(binary_log_file), binary_log_pos=VALUES(binary_log_pos), master_host=VALUES(master_host), master_port=VALUES(master_port), slave_sql_running=VALUES(slave_sql_running), slave_io_running=VALUES(slave_io_running), replication_sql_thread_state=VALUES(replication_sql_thread_state), replication_io_thread_state=VALUES(replication_io_thread_state), has_replication_filters=VALUES(has_replication_filters), supports_oracle_gtid=VALUES(supports_oracle_gtid), oracle_gtid=VALUES(oracle_gtid), master_uuid=VALUES(master_uuid), ancestry_uuid=VALUES(ancestry_uuid), executed_gtid_set=VALUES(executed_gtid_set), gtid_mode=VALUES(gtid_mode), gtid_purged=VALUES(gtid_purged), gtid_errant=VALUES(gtid_errant), mariadb_gtid=VALUES(mariadb_gtid), pseudo_gtid=VALUES(pseudo_gtid), master_log_file=VALUES(master_log_file), read_master_log_pos=VALUES(read_master_log_pos), relay_master_log_file=VALUES(relay_master_log_file), exec_master_log_pos=VALUES(exec_master_log_pos), relay_log_file=VALUES(relay_log_file), relay_log_pos=VALUES(relay_log_pos), last_sql_error=VALUES(last_sql_error), last_io_error=VALUES(last_io_error), seconds_behind_master=VALUES(seconds_behind_master), slave_lag_seconds=VALUES(slave_lag_seconds), sql_delay=VALUES(sql_delay), num_slave_hosts=VALUES(num_slave_hosts), slave_hosts=VALUES(slave_hosts), cluster_name=VALUES(cluster_name), suggested_cluster_alias=VALUES(suggested_cluster_alias), data_center=VALUES(data_center), region=VALUES(region), + physical_environment=VALUES(physical_environment), replication_depth=VALUES(replication_depth), is_co_master=VALUES(is_co_master), replication_credentials_available=VALUES(replication_credentials_available), has_replication_credentials=VALUES(has_replication_credentials), allow_tls=VALUES(allow_tls), semi_sync_enforced=VALUES(semi_sync_enforced), semi_sync_available=VALUES(semi_sync_available), + semi_sync_master_enabled=VALUES(semi_sync_master_enabled), semi_sync_master_timeout=VALUES(semi_sync_master_timeout), semi_sync_master_wait_for_slave_count=VALUES(semi_sync_master_wait_for_slave_count), semi_sync_replica_enabled=VALUES(semi_sync_replica_enabled), semi_sync_master_status=VALUES(semi_sync_master_status), semi_sync_master_clients=VALUES(semi_sync_master_clients), semi_sync_replica_status=VALUES(semi_sync_replica_status), + instance_alias=VALUES(instance_alias), last_discovery_latency=VALUES(last_discovery_latency), replication_group_name=VALUES(replication_group_name), 
replication_group_is_single_primary_mode=VALUES(replication_group_is_single_primary_mode), replication_group_member_state=VALUES(replication_group_member_state), replication_group_member_role=VALUES(replication_group_member_role), replication_group_members=VALUES(replication_group_members), replication_group_primary_host=VALUES(replication_group_primary_host), replication_group_primary_port=VALUES(replication_group_primary_port), last_seen=VALUES(last_seen) + ` + a3 := ` + i710, 3306, 0, 710, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 10, , 0, , , {0 false}, {0 false}, 0, 0, [], , , , , , 0, false, false, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, + i720, 3306, 0, 720, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 20, , 0, , , {0 false}, {0 false}, 0, 0, [], , , , , , 0, false, false, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, + i730, 3306, 0, 730, , 5.6.7, 5.6, MySQL, false, false, STATEMENT, FULL, false, false, , 0, , 0, false, false, 0, 0, false, false, false, , , , , , , false, false, , 0, mysql.000007, 30, , 0, , , {0 false}, {0 false}, 0, 0, [], , , , , , 0, false, false, false, false, false, false, false, 0, 0, false, false, 0, false, , 0, , false, , , [], , 0, + ` + + sql3, args3, err := mkInsertOdkuForInstances(instances[:3], true, true) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(normalizeQuery(sql3), normalizeQuery(s3)) + test.S(t).ExpectEquals(stripSpaces(fmtArgs(args3)), stripSpaces(a3)) +} + +func fmtArgs(args []interface{}) string { + b := &bytes.Buffer{} + for _, a := range args { + fmt.Fprint(b, a) + fmt.Fprint(b, ", ") + } + return b.String() +} diff --git a/go/vt/orchestrator/inst/instance_key.go b/go/vt/orchestrator/inst/instance_key.go new file mode 100644 index 0000000000..3714f7db77 --- /dev/null +++ b/go/vt/orchestrator/inst/instance_key.go @@ -0,0 +1,191 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + "regexp" + "strconv" + "strings" + + "vitess.io/vitess/go/vt/orchestrator/config" +) + +// InstanceKey is an instance indicator, identifued by hostname and port +type InstanceKey struct { + Hostname string + Port int +} + +var ( + ipv4Regexp = regexp.MustCompile("^([0-9]+)[.]([0-9]+)[.]([0-9]+)[.]([0-9]+)$") + ipv4HostPortRegexp = regexp.MustCompile("^([^:]+):([0-9]+)$") + ipv4HostRegexp = regexp.MustCompile("^([^:]+)$") + ipv6HostPortRegexp = regexp.MustCompile("^\\[([:0-9a-fA-F]+)\\]:([0-9]+)$") // e.g. [2001:db8:1f70::999:de8:7648:6e8]:3308 + ipv6HostRegexp = regexp.MustCompile("^([:0-9a-fA-F]+)$") // e.g. 
2001:db8:1f70::999:de8:7648:6e8 +) + +const detachHint = "//" + +func newInstanceKey(hostname string, port int, resolve bool) (instanceKey *InstanceKey, err error) { + if hostname == "" { + return instanceKey, fmt.Errorf("NewResolveInstanceKey: Empty hostname") + } + + instanceKey = &InstanceKey{Hostname: hostname, Port: port} + if resolve { + instanceKey, err = instanceKey.ResolveHostname() + } + return instanceKey, err +} + +// newInstanceKeyStrings +func newInstanceKeyStrings(hostname string, port string, resolve bool) (*InstanceKey, error) { + if portInt, err := strconv.Atoi(port); err != nil { + return nil, fmt.Errorf("Invalid port: %s", port) + } else { + return newInstanceKey(hostname, portInt, resolve) + } +} +func parseRawInstanceKey(hostPort string, resolve bool) (instanceKey *InstanceKey, err error) { + hostname := "" + port := "" + if submatch := ipv4HostPortRegexp.FindStringSubmatch(hostPort); len(submatch) > 0 { + hostname = submatch[1] + port = submatch[2] + } else if submatch := ipv4HostRegexp.FindStringSubmatch(hostPort); len(submatch) > 0 { + hostname = submatch[1] + } else if submatch := ipv6HostPortRegexp.FindStringSubmatch(hostPort); len(submatch) > 0 { + hostname = submatch[1] + port = submatch[2] + } else if submatch := ipv6HostRegexp.FindStringSubmatch(hostPort); len(submatch) > 0 { + hostname = submatch[1] + } else { + return nil, fmt.Errorf("Cannot parse address: %s", hostPort) + } + if port == "" { + port = fmt.Sprintf("%d", config.Config.DefaultInstancePort) + } + return newInstanceKeyStrings(hostname, port, resolve) +} + +func NewResolveInstanceKey(hostname string, port int) (instanceKey *InstanceKey, err error) { + return newInstanceKey(hostname, port, true) +} + +// NewResolveInstanceKeyStrings creates and resolves a new instance key based on string params +func NewResolveInstanceKeyStrings(hostname string, port string) (*InstanceKey, error) { + return newInstanceKeyStrings(hostname, port, true) +} + +func ParseResolveInstanceKey(hostPort string) (instanceKey *InstanceKey, err error) { + return parseRawInstanceKey(hostPort, true) +} + +func ParseRawInstanceKey(hostPort string) (instanceKey *InstanceKey, err error) { + return parseRawInstanceKey(hostPort, false) +} + +// NewResolveInstanceKeyStrings creates and resolves a new instance key based on string params +func NewRawInstanceKeyStrings(hostname string, port string) (*InstanceKey, error) { + return newInstanceKeyStrings(hostname, port, false) +} + +// +func (this *InstanceKey) ResolveHostname() (*InstanceKey, error) { + if !this.IsValid() { + return this, nil + } + + hostname, err := ResolveHostname(this.Hostname) + if err == nil { + this.Hostname = hostname + } + return this, err +} + +// Equals tests equality between this key and another key +func (this *InstanceKey) Equals(other *InstanceKey) bool { + if other == nil { + return false + } + return this.Hostname == other.Hostname && this.Port == other.Port +} + +// SmallerThan returns true if this key is dictionary-smaller than another. +// This is used for consistent sorting/ordering; there's nothing magical about it. 
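+// For example, {Hostname: "a", Port: 3307} is smaller than {Hostname: "b", Port: 3306}, and
+// {Hostname: "a", Port: 3306} is smaller than {Hostname: "a", Port: 3307}.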
+func (this *InstanceKey) SmallerThan(other *InstanceKey) bool {
+	if this.Hostname < other.Hostname {
+		return true
+	}
+	if this.Hostname == other.Hostname && this.Port < other.Port {
+		return true
+	}
+	return false
+}
+
+// IsDetached returns 'true' when this hostname is logically "detached"
+func (this *InstanceKey) IsDetached() bool {
+	return strings.HasPrefix(this.Hostname, detachHint)
+}
+
+// IsValid uses simple heuristics to see whether this key represents an actual instance
+func (this *InstanceKey) IsValid() bool {
+	if this.Hostname == "_" {
+		return false
+	}
+	if this.IsDetached() {
+		return false
+	}
+	return len(this.Hostname) > 0 && this.Port > 0
+}
+
+// DetachedKey returns an instance key whose hostname is detached: invalid, but recoverable
+func (this *InstanceKey) DetachedKey() *InstanceKey {
+	if this.IsDetached() {
+		return this
+	}
+	return &InstanceKey{Hostname: fmt.Sprintf("%s%s", detachHint, this.Hostname), Port: this.Port}
+}
+
+// ReattachedKey returns an instance key whose hostname is reattached: the detachment hint is removed,
+// making the key valid again
+func (this *InstanceKey) ReattachedKey() *InstanceKey {
+	if !this.IsDetached() {
+		return this
+	}
+	return &InstanceKey{Hostname: this.Hostname[len(detachHint):], Port: this.Port}
+}
+
+// StringCode returns an official string representation of this key
+func (this *InstanceKey) StringCode() string {
+	return fmt.Sprintf("%s:%d", this.Hostname, this.Port)
+}
+
+// DisplayString returns a user-friendly string representation of this key
+func (this *InstanceKey) DisplayString() string {
+	return this.StringCode()
+}
+
+// String returns a user-friendly string representation of this key
+func (this InstanceKey) String() string {
+	return this.StringCode()
+}
+
+// IsIPv4 returns 'true' when the hostname is an IPv4 address
+func (this *InstanceKey) IsIPv4() bool {
+	return ipv4Regexp.MatchString(this.Hostname)
+}
diff --git a/go/vt/orchestrator/inst/instance_key_map.go b/go/vt/orchestrator/inst/instance_key_map.go
new file mode 100644
index 0000000000..ea258c1a11
--- /dev/null
+++ b/go/vt/orchestrator/inst/instance_key_map.go
@@ -0,0 +1,141 @@
+/*
+   Copyright 2015 Shlomi Noach, courtesy Booking.com
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/ + +package inst + +import ( + "encoding/json" + "sort" + "strings" +) + +// InstanceKeyMap is a convenience struct for listing InstanceKey-s +type InstanceKeyMap map[InstanceKey]bool + +func NewInstanceKeyMap() *InstanceKeyMap { + return &InstanceKeyMap{} +} + +// AddKey adds a single key to this map +func (this *InstanceKeyMap) AddKey(key InstanceKey) { + (*this)[key] = true +} + +// AddKeys adds all given keys to this map +func (this *InstanceKeyMap) AddKeys(keys []InstanceKey) { + for _, key := range keys { + this.AddKey(key) + } +} + +// AddInstances adds keys of all given instances to this map +func (this *InstanceKeyMap) AddInstances(instances [](*Instance)) { + for _, instance := range instances { + this.AddKey(instance.Key) + } +} + +// HasKey checks if given key is within the map +func (this *InstanceKeyMap) HasKey(key InstanceKey) bool { + _, ok := (*this)[key] + return ok +} + +// GetInstanceKeys returns keys in this map in the form of an array +func (this *InstanceKeyMap) GetInstanceKeys() []InstanceKey { + res := []InstanceKey{} + for key := range *this { + res = append(res, key) + } + sort.Slice(res, func(i, j int) bool { + return res[i].Hostname < res[j].Hostname || res[i].Hostname == res[j].Hostname && res[i].Port < res[j].Port + }) + return res +} + +// Intersect returns a keymap which is the intersection of this and another map +func (this *InstanceKeyMap) Intersect(other *InstanceKeyMap) *InstanceKeyMap { + intersected := NewInstanceKeyMap() + for key := range *other { + if this.HasKey(key) { + intersected.AddKey(key) + } + } + return intersected +} + +// MarshalJSON will marshal this map as JSON +func (this InstanceKeyMap) MarshalJSON() ([]byte, error) { + return json.Marshal(this.GetInstanceKeys()) +} + +// UnmarshalJSON reds this object from JSON +func (this *InstanceKeyMap) UnmarshalJSON(b []byte) error { + var keys []InstanceKey + if err := json.Unmarshal(b, &keys); err != nil { + return err + } + *this = make(InstanceKeyMap) + for _, key := range keys { + this.AddKey(key) + } + return nil +} + +// ToJSON will marshal this map as JSON +func (this *InstanceKeyMap) ToJSON() (string, error) { + bytes, err := this.MarshalJSON() + return string(bytes), err +} + +// ToJSONString will marshal this map as JSON +func (this *InstanceKeyMap) ToJSONString() string { + s, _ := this.ToJSON() + return s +} + +// ToCommaDelimitedList will export this map in comma delimited format +func (this *InstanceKeyMap) ToCommaDelimitedList() string { + keyDisplays := []string{} + for key := range *this { + keyDisplays = append(keyDisplays, key.DisplayString()) + } + return strings.Join(keyDisplays, ",") +} + +// ReadJson unmarshalls a json into this map +func (this *InstanceKeyMap) ReadJson(jsonString string) error { + var keys []InstanceKey + err := json.Unmarshal([]byte(jsonString), &keys) + if err != nil { + return err + } + this.AddKeys(keys) + return err +} + +// ReadJson unmarshalls a json into this map +func (this *InstanceKeyMap) ReadCommaDelimitedList(list string) error { + tokens := strings.Split(list, ",") + for _, token := range tokens { + key, err := ParseResolveInstanceKey(token) + if err != nil { + return err + } + this.AddKey(*key) + } + return nil +} diff --git a/go/vt/orchestrator/inst/instance_key_map_test.go b/go/vt/orchestrator/inst/instance_key_map_test.go new file mode 100644 index 0000000000..a3c33e4065 --- /dev/null +++ b/go/vt/orchestrator/inst/instance_key_map_test.go @@ -0,0 +1,127 @@ +/* + Copyright 2014 Outbrain Inc. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "math/rand" + "testing" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func init() { + config.Config.HostnameResolveMethod = "none" + config.MarkConfigurationLoaded() + log.SetLevel(log.ERROR) +} + +func TestGetInstanceKeys(t *testing.T) { + for range rand.Perm(10) { // Just running many iterations to cover multiple possible map iteration ordering. Perm() is just used as an array generator here. + m := *NewInstanceKeyMap() + m.AddKey(key1) + m.AddKey(key2) + keys := m.GetInstanceKeys() + test.S(t).ExpectEquals(keys[0], key1) + test.S(t).ExpectEquals(keys[1], key2) + } + for range rand.Perm(10) { // Just running many iterations to cover multiple possible map iteration ordering. Perm() is just used as an array generator here. + m := *NewInstanceKeyMap() + m.AddKey(key2) + m.AddKey(key1) + keys := m.GetInstanceKeys() + test.S(t).ExpectEquals(keys[0], key1) + test.S(t).ExpectEquals(keys[1], key2) + } +} + +func TestInstanceKeyMapToJSON(t *testing.T) { + m := *NewInstanceKeyMap() + m.AddKey(key1) + m.AddKey(key2) + json, err := m.ToJSON() + test.S(t).ExpectNil(err) + ok := (json == `[{"Hostname":"host1","Port":3306},{"Hostname":"host2","Port":3306}]`) || (json == `[{"Hostname":"host2","Port":3306},{"Hostname":"host1","Port":3306}]`) + test.S(t).ExpectTrue(ok) +} + +func TestInstanceKeyMapReadJSON(t *testing.T) { + json := `[{"Hostname":"host1","Port":3306},{"Hostname":"host2","Port":3306}]` + m := *NewInstanceKeyMap() + m.ReadJson(json) + test.S(t).ExpectEquals(len(m), 2) + test.S(t).ExpectTrue(m[key1]) + test.S(t).ExpectTrue(m[key2]) +} + +func TestEmptyInstanceKeyMapToCommaDelimitedList(t *testing.T) { + m := *NewInstanceKeyMap() + res := m.ToCommaDelimitedList() + + test.S(t).ExpectEquals(res, "") +} + +func TestInstanceKeyMapToCommaDelimitedList(t *testing.T) { + m := *NewInstanceKeyMap() + m.AddKey(key1) + m.AddKey(key2) + res := m.ToCommaDelimitedList() + + ok := (res == `host1:3306,host2:3306`) || (res == `host2:3306,host1:3306`) + test.S(t).ExpectTrue(ok) +} + +func TestIntersect(t *testing.T) { + { + m := NewInstanceKeyMap() + m.AddKey(key1) + m.AddKey(key2) + + other := NewInstanceKeyMap() + other.AddKey(key3) + other.AddKey(key2) + + intersected := m.Intersect(other) + test.S(t).ExpectEquals(len(*intersected), 1) + } + { + m := NewInstanceKeyMap() + m.AddKey(key1) + + other := NewInstanceKeyMap() + other.AddKey(key3) + other.AddKey(key2) + + intersected := m.Intersect(other) + test.S(t).ExpectEquals(len(*intersected), 0) + } + { + m := NewInstanceKeyMap() + m.AddKey(key1) + m.AddKey(key2) + + other := NewInstanceKeyMap() + other.AddKey(key1) + other.AddKey(key3) + other.AddKey(key2) + + intersected := m.Intersect(other) + test.S(t).ExpectEquals(len(*intersected), 2) + } + +} diff --git a/go/vt/orchestrator/inst/instance_key_test.go b/go/vt/orchestrator/inst/instance_key_test.go new file 
mode 100644 index 0000000000..d6290ac53e --- /dev/null +++ b/go/vt/orchestrator/inst/instance_key_test.go @@ -0,0 +1,211 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "testing" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func init() { + config.Config.HostnameResolveMethod = "none" + config.MarkConfigurationLoaded() + log.SetLevel(log.ERROR) +} + +var key1 = InstanceKey{Hostname: "host1", Port: 3306} +var key2 = InstanceKey{Hostname: "host2", Port: 3306} +var key3 = InstanceKey{Hostname: "host3", Port: 3306} + +func TestInstanceKeyEquals(t *testing.T) { + i1 := Instance{ + Key: InstanceKey{ + Hostname: "sql00.db", + Port: 3306, + }, + Version: "5.6", + } + i2 := Instance{ + Key: InstanceKey{ + Hostname: "sql00.db", + Port: 3306, + }, + Version: "5.5", + } + + test.S(t).ExpectEquals(i1.Key, i2.Key) + + i2.Key.Port = 3307 + test.S(t).ExpectNotEquals(i1.Key, i2.Key) +} + +func TestNewResolveInstanceKey(t *testing.T) { + { + i, err := NewResolveInstanceKey("127.0.0.1", 3308) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(i.Hostname, "127.0.0.1") + test.S(t).ExpectEquals(i.Port, 3308) + } + { + _, err := NewResolveInstanceKey("", 3309) + test.S(t).ExpectNotNil(err) + } + { + i, err := NewResolveInstanceKey("127.0.0.1", 0) + test.S(t).ExpectNil(err) + test.S(t).ExpectFalse(i.IsValid()) + } +} + +func TestParseResolveInstanceKey(t *testing.T) { + { + key, err := ParseResolveInstanceKey("myhost:1234") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(key.Hostname, "myhost") + test.S(t).ExpectEquals(key.Port, 1234) + } + { + key, err := ParseResolveInstanceKey("myhost") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(key.Hostname, "myhost") + test.S(t).ExpectEquals(key.Port, 3306) + } + { + key, err := ParseResolveInstanceKey("10.0.0.3:3307") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(key.Hostname, "10.0.0.3") + test.S(t).ExpectEquals(key.Port, 3307) + } + { + key, err := ParseResolveInstanceKey("10.0.0.3") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(key.Hostname, "10.0.0.3") + test.S(t).ExpectEquals(key.Port, 3306) + } + { + key, err := ParseResolveInstanceKey("[2001:db8:1f70::999:de8:7648:6e8]:3308") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(key.Hostname, "2001:db8:1f70::999:de8:7648:6e8") + test.S(t).ExpectEquals(key.Port, 3308) + } + { + key, err := ParseResolveInstanceKey("::1") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(key.Hostname, "::1") + test.S(t).ExpectEquals(key.Port, 3306) + } + { + key, err := ParseResolveInstanceKey("0:0:0:0:0:0:0:0") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(key.Hostname, "0:0:0:0:0:0:0:0") + test.S(t).ExpectEquals(key.Port, 3306) + } + { + _, err := ParseResolveInstanceKey("[2001:xxxx:1f70::999:de8:7648:6e8]:3308") + test.S(t).ExpectNotNil(err) + } + { + _, err := ParseResolveInstanceKey("10.0.0.4:") + 
test.S(t).ExpectNotNil(err) + } + { + _, err := ParseResolveInstanceKey("10.0.0.4:5.6.7") + test.S(t).ExpectNotNil(err) + } +} + +func TestNewResolveInstanceKeyStrings(t *testing.T) { + { + i, err := NewResolveInstanceKeyStrings("127.0.0.1", "3306") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(i.Hostname, "127.0.0.1") + test.S(t).ExpectEquals(i.Port, 3306) + } + { + _, err := NewResolveInstanceKeyStrings("127.0.0.1", "") + test.S(t).ExpectNotNil(err) + } + { + _, err := NewResolveInstanceKeyStrings("127.0.0.1", "3306x") + test.S(t).ExpectNotNil(err) + } +} + +func TestInstanceKeyValid(t *testing.T) { + test.S(t).ExpectTrue(key1.IsValid()) + i, err := ParseResolveInstanceKey("_:3306") + test.S(t).ExpectNil(err) + test.S(t).ExpectFalse(i.IsValid()) + i, err = ParseResolveInstanceKey("//myhost:3306") + test.S(t).ExpectNil(err) + test.S(t).ExpectFalse(i.IsValid()) +} + +func TestInstanceKeyDetach(t *testing.T) { + test.S(t).ExpectFalse(key1.IsDetached()) + detached1 := key1.DetachedKey() + test.S(t).ExpectTrue(detached1.IsDetached()) + detached2 := key1.DetachedKey() + test.S(t).ExpectTrue(detached2.IsDetached()) + test.S(t).ExpectTrue(detached1.Equals(detached2)) + + reattached1 := detached1.ReattachedKey() + test.S(t).ExpectFalse(reattached1.IsDetached()) + test.S(t).ExpectTrue(reattached1.Equals(&key1)) + reattached2 := reattached1.ReattachedKey() + test.S(t).ExpectFalse(reattached2.IsDetached()) + test.S(t).ExpectTrue(reattached1.Equals(reattached2)) +} + +func TestIsIPv4(t *testing.T) { + test.S(t).ExpectFalse(key1.IsIPv4()) + { + k, _ := ParseRawInstanceKey("mysql-server-1:3306") + test.S(t).ExpectFalse(k.IsIPv4()) + } + { + k, _ := ParseRawInstanceKey("mysql-server-1") + test.S(t).ExpectFalse(k.IsIPv4()) + } + { + k, _ := ParseRawInstanceKey("my.sql.server.1") + test.S(t).ExpectFalse(k.IsIPv4()) + } + { + k, _ := ParseRawInstanceKey("mysql-server-1:3306") + test.S(t).ExpectFalse(k.IsIPv4()) + } + { + k, _ := ParseRawInstanceKey("127.0.0:3306") + test.S(t).ExpectFalse(k.IsIPv4()) + } + { + k, _ := ParseRawInstanceKey("127::0::0::1:3306") + test.S(t).ExpectFalse(k.IsIPv4()) + } + { + k, _ := ParseRawInstanceKey("127.0.0.1:3306") + test.S(t).ExpectTrue(k.IsIPv4()) + } + { + k, _ := ParseRawInstanceKey("127.0.0.1") + test.S(t).ExpectTrue(k.IsIPv4()) + } +} diff --git a/go/vt/orchestrator/inst/instance_test.go b/go/vt/orchestrator/inst/instance_test.go new file mode 100644 index 0000000000..e47c745860 --- /dev/null +++ b/go/vt/orchestrator/inst/instance_test.go @@ -0,0 +1,241 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "testing" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func init() { + config.Config.HostnameResolveMethod = "none" + config.MarkConfigurationLoaded() + log.SetLevel(log.ERROR) +} + +var instance1 = Instance{Key: key1} +var instance2 = Instance{Key: key2} +var instance3 = Instance{Key: key3} + +func TestIsSmallerMajorVersion(t *testing.T) { + i55 := Instance{Version: "5.5"} + i5517 := Instance{Version: "5.5.17"} + i56 := Instance{Version: "5.6"} + + test.S(t).ExpectFalse(i55.IsSmallerMajorVersion(&i5517)) + test.S(t).ExpectFalse(i56.IsSmallerMajorVersion(&i5517)) + test.S(t).ExpectTrue(i55.IsSmallerMajorVersion(&i56)) +} + +func TestIsVersion(t *testing.T) { + i51 := Instance{Version: "5.1.19"} + i55 := Instance{Version: "5.5.17-debug"} + i56 := Instance{Version: "5.6.20"} + i57 := Instance{Version: "5.7.8-log"} + + test.S(t).ExpectTrue(i51.IsMySQL51()) + test.S(t).ExpectTrue(i55.IsMySQL55()) + test.S(t).ExpectTrue(i56.IsMySQL56()) + test.S(t).ExpectFalse(i55.IsMySQL56()) + test.S(t).ExpectTrue(i57.IsMySQL57()) + test.S(t).ExpectFalse(i56.IsMySQL57()) +} + +func TestIsSmallerBinlogFormat(t *testing.T) { + iStatement := &Instance{Key: key1, Binlog_format: "STATEMENT"} + iRow := &Instance{Key: key2, Binlog_format: "ROW"} + iMixed := &Instance{Key: key3, Binlog_format: "MIXED"} + test.S(t).ExpectTrue(iStatement.IsSmallerBinlogFormat(iRow)) + test.S(t).ExpectFalse(iStatement.IsSmallerBinlogFormat(iStatement)) + test.S(t).ExpectFalse(iRow.IsSmallerBinlogFormat(iStatement)) + + test.S(t).ExpectTrue(iStatement.IsSmallerBinlogFormat(iMixed)) + test.S(t).ExpectTrue(iMixed.IsSmallerBinlogFormat(iRow)) + test.S(t).ExpectFalse(iMixed.IsSmallerBinlogFormat(iStatement)) + test.S(t).ExpectFalse(iRow.IsSmallerBinlogFormat(iMixed)) +} + +func TestIsDescendant(t *testing.T) { + { + i57 := Instance{Key: key1, Version: "5.7"} + i56 := Instance{Key: key2, Version: "5.6"} + isDescendant := i57.IsDescendantOf(&i56) + test.S(t).ExpectEquals(isDescendant, false) + } + { + i57 := Instance{Key: key1, Version: "5.7", AncestryUUID: "00020192-1111-1111-1111-111111111111"} + i56 := Instance{Key: key2, Version: "5.6", ServerUUID: ""} + isDescendant := i57.IsDescendantOf(&i56) + test.S(t).ExpectEquals(isDescendant, false) + } + { + i57 := Instance{Key: key1, Version: "5.7", AncestryUUID: ""} + i56 := Instance{Key: key2, Version: "5.6", ServerUUID: "00020192-1111-1111-1111-111111111111"} + isDescendant := i57.IsDescendantOf(&i56) + test.S(t).ExpectEquals(isDescendant, false) + } + { + i57 := Instance{Key: key1, Version: "5.7", AncestryUUID: "00020193-2222-2222-2222-222222222222"} + i56 := Instance{Key: key2, Version: "5.6", ServerUUID: "00020192-1111-1111-1111-111111111111"} + isDescendant := i57.IsDescendantOf(&i56) + test.S(t).ExpectEquals(isDescendant, false) + } + { + i57 := Instance{Key: key1, Version: "5.7", AncestryUUID: "00020193-2222-2222-2222-222222222222,00020193-3333-3333-3333-222222222222"} + i56 := Instance{Key: key2, Version: "5.6", ServerUUID: "00020192-1111-1111-1111-111111111111"} + isDescendant := i57.IsDescendantOf(&i56) + test.S(t).ExpectEquals(isDescendant, false) + } + { + i57 := Instance{Key: key1, Version: "5.7", AncestryUUID: "00020193-2222-2222-2222-222222222222,00020192-1111-1111-1111-111111111111"} + i56 := Instance{Key: key2, Version: "5.6", ServerUUID: "00020192-1111-1111-1111-111111111111"} + isDescendant := 
i57.IsDescendantOf(&i56) + test.S(t).ExpectEquals(isDescendant, true) + } +} + +func TestCanReplicateFrom(t *testing.T) { + i55 := Instance{Key: key1, Version: "5.5"} + i56 := Instance{Key: key2, Version: "5.6"} + + var canReplicate bool + canReplicate, _ = i56.CanReplicateFrom(&i55) + test.S(t).ExpectEquals(canReplicate, false) //binlog not yet enabled + + i55.LogBinEnabled = true + i55.LogReplicationUpdatesEnabled = true + i56.LogBinEnabled = true + i56.LogReplicationUpdatesEnabled = true + + canReplicate, _ = i56.CanReplicateFrom(&i55) + test.S(t).ExpectEquals(canReplicate, false) //serverid not set + i55.ServerID = 55 + i56.ServerID = 56 + + canReplicate, err := i56.CanReplicateFrom(&i55) + test.S(t).ExpectNil(err) + test.S(t).ExpectTrue(canReplicate) + canReplicate, _ = i55.CanReplicateFrom(&i56) + test.S(t).ExpectFalse(canReplicate) + + iStatement := Instance{Key: key1, Binlog_format: "STATEMENT", ServerID: 1, Version: "5.5", LogBinEnabled: true, LogReplicationUpdatesEnabled: true} + iRow := Instance{Key: key2, Binlog_format: "ROW", ServerID: 2, Version: "5.5", LogBinEnabled: true, LogReplicationUpdatesEnabled: true} + canReplicate, err = iRow.CanReplicateFrom(&iStatement) + test.S(t).ExpectNil(err) + test.S(t).ExpectTrue(canReplicate) + canReplicate, _ = iStatement.CanReplicateFrom(&iRow) + test.S(t).ExpectFalse(canReplicate) +} + +func TestNextGTID(t *testing.T) { + { + i := Instance{ExecutedGtidSet: "4f6d62ed-df65-11e3-b395-60672090eb04:1,b9b4712a-df64-11e3-b391-60672090eb04:1-6"} + nextGTID, err := i.NextGTID() + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(nextGTID, "b9b4712a-df64-11e3-b391-60672090eb04:7") + } + { + i := Instance{ExecutedGtidSet: "b9b4712a-df64-11e3-b391-60672090eb04:1-6"} + nextGTID, err := i.NextGTID() + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(nextGTID, "b9b4712a-df64-11e3-b391-60672090eb04:7") + } + { + i := Instance{ExecutedGtidSet: "b9b4712a-df64-11e3-b391-60672090eb04:6"} + nextGTID, err := i.NextGTID() + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(nextGTID, "b9b4712a-df64-11e3-b391-60672090eb04:7") + } +} + +func TestRemoveInstance(t *testing.T) { + { + instances := [](*Instance){&instance1, &instance2} + test.S(t).ExpectEquals(len(instances), 2) + instances = RemoveNilInstances(instances) + test.S(t).ExpectEquals(len(instances), 2) + } + { + instances := [](*Instance){&instance1, nil, &instance2} + test.S(t).ExpectEquals(len(instances), 3) + instances = RemoveNilInstances(instances) + test.S(t).ExpectEquals(len(instances), 2) + } + { + instances := [](*Instance){&instance1, &instance2} + test.S(t).ExpectEquals(len(instances), 2) + instances = RemoveInstance(instances, &key1) + test.S(t).ExpectEquals(len(instances), 1) + instances = RemoveInstance(instances, &key1) + test.S(t).ExpectEquals(len(instances), 1) + instances = RemoveInstance(instances, &key2) + test.S(t).ExpectEquals(len(instances), 0) + instances = RemoveInstance(instances, &key2) + test.S(t).ExpectEquals(len(instances), 0) + } +} + +func TestHumanReadableDescription(t *testing.T) { + i57 := Instance{Version: "5.7.8-log"} + { + desc := i57.HumanReadableDescription() + test.S(t).ExpectEquals(desc, "[unknown,invalid,5.7.8-log,rw,nobinlog]") + } + { + i57.UsingPseudoGTID = true + i57.LogBinEnabled = true + i57.Binlog_format = "ROW" + i57.LogReplicationUpdatesEnabled = true + desc := i57.HumanReadableDescription() + test.S(t).ExpectEquals(desc, "[unknown,invalid,5.7.8-log,rw,ROW,>>,P-GTID]") + } +} + +func TestTabulatedDescription(t *testing.T) { + i57 := Instance{Version: 
"5.7.8-log"} + { + desc := i57.TabulatedDescription("|") + test.S(t).ExpectEquals(desc, "unknown|invalid|5.7.8-log|rw|nobinlog|") + } + { + i57.UsingPseudoGTID = true + i57.LogBinEnabled = true + i57.Binlog_format = "ROW" + i57.LogReplicationUpdatesEnabled = true + desc := i57.TabulatedDescription("|") + test.S(t).ExpectEquals(desc, "unknown|invalid|5.7.8-log|rw|ROW|>>,P-GTID") + } +} + +func TestReplicationThreads(t *testing.T) { + { + test.S(t).ExpectFalse(instance1.ReplicaRunning()) + } + { + test.S(t).ExpectTrue(instance1.ReplicationThreadsExist()) + } + { + test.S(t).ExpectTrue(instance1.ReplicationThreadsStopped()) + } + { + i := Instance{Key: key1, ReplicationIOThreadState: ReplicationThreadStateNoThread, ReplicationSQLThreadState: ReplicationThreadStateNoThread} + test.S(t).ExpectFalse(i.ReplicationThreadsExist()) + } +} diff --git a/go/vt/orchestrator/inst/instance_topology.go b/go/vt/orchestrator/inst/instance_topology.go new file mode 100644 index 0000000000..fd43900f78 --- /dev/null +++ b/go/vt/orchestrator/inst/instance_topology.go @@ -0,0 +1,2903 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + goos "os" + "regexp" + "sort" + "strings" + "sync" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/math" + "vitess.io/vitess/go/vt/orchestrator/external/golib/util" + "vitess.io/vitess/go/vt/orchestrator/os" +) + +type StopReplicationMethod string + +const ( + NoStopReplication StopReplicationMethod = "NoStopReplication" + StopReplicationNormal = "StopReplicationNormal" + StopReplicationNice = "StopReplicationNice" +) + +var ReplicationNotRunningError = fmt.Errorf("Replication not running") + +var asciiFillerCharacter = " " +var tabulatorScharacter = "|" + +var countRetries = 5 + +// getASCIITopologyEntry will get an ascii topology tree rooted at given instance. 
Ir recursively +// draws the tree +func getASCIITopologyEntry(depth int, instance *Instance, replicationMap map[*Instance]([]*Instance), extendedOutput bool, fillerCharacter string, tabulated bool, printTags bool) []string { + if instance == nil { + return []string{} + } + if instance.IsCoMaster && depth > 1 { + return []string{} + } + prefix := "" + if depth > 0 { + prefix = strings.Repeat(fillerCharacter, (depth-1)*2) + if instance.ReplicaRunning() && instance.IsLastCheckValid && instance.IsRecentlyChecked { + prefix += "+" + fillerCharacter + } else { + prefix += "-" + fillerCharacter + } + } + entryAlias := "" + if instance.InstanceAlias != "" { + entryAlias = fmt.Sprintf(" (%s)", instance.InstanceAlias) + } + entry := fmt.Sprintf("%s%s%s", prefix, instance.Key.DisplayString(), entryAlias) + if extendedOutput { + if tabulated { + entry = fmt.Sprintf("%s%s%s", entry, tabulatorScharacter, instance.TabulatedDescription(tabulatorScharacter)) + } else { + entry = fmt.Sprintf("%s%s%s", entry, fillerCharacter, instance.HumanReadableDescription()) + } + if printTags { + tags, _ := ReadInstanceTags(&instance.Key) + tagsString := make([]string, len(tags)) + for idx, tag := range tags { + tagsString[idx] = tag.Display() + } + entry = fmt.Sprintf("%s [%s]", entry, strings.Join(tagsString, ",")) + } + } + result := []string{entry} + for _, replica := range replicationMap[instance] { + replicasResult := getASCIITopologyEntry(depth+1, replica, replicationMap, extendedOutput, fillerCharacter, tabulated, printTags) + result = append(result, replicasResult...) + } + return result +} + +// ASCIITopology returns a string representation of the topology of given cluster. +func ASCIITopology(clusterName string, historyTimestampPattern string, tabulated bool, printTags bool) (result string, err error) { + fillerCharacter := asciiFillerCharacter + var instances [](*Instance) + if historyTimestampPattern == "" { + instances, err = ReadClusterInstances(clusterName) + } else { + instances, err = ReadHistoryClusterInstances(clusterName, historyTimestampPattern) + } + if err != nil { + return "", err + } + + instancesMap := make(map[InstanceKey](*Instance)) + for _, instance := range instances { + log.Debugf("instanceKey: %+v", instance.Key) + instancesMap[instance.Key] = instance + } + + replicationMap := make(map[*Instance]([]*Instance)) + var masterInstance *Instance + // Investigate replicas: + for _, instance := range instances { + master, ok := instancesMap[instance.MasterKey] + if ok { + if _, ok := replicationMap[master]; !ok { + replicationMap[master] = [](*Instance){} + } + replicationMap[master] = append(replicationMap[master], instance) + } else { + masterInstance = instance + } + } + // Get entries: + var entries []string + if masterInstance != nil { + // Single master + entries = getASCIITopologyEntry(0, masterInstance, replicationMap, historyTimestampPattern == "", fillerCharacter, tabulated, printTags) + } else { + // Co-masters? For visualization we put each in its own branch while ignoring its other co-masters. + for _, instance := range instances { + if instance.IsCoMaster { + entries = append(entries, getASCIITopologyEntry(1, instance, replicationMap, historyTimestampPattern == "", fillerCharacter, tabulated, printTags)...) + } + } + } + // Beautify: make sure the "[...]" part is nicely aligned for all instances. 
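The "Beautify" pass that follows is easy to miss: it pads each rendered entry so that every bracketed description starts in the same column. A minimal standalone sketch of that idea, using plain strings only; the hostnames are hypothetical, the helper name is illustrative, and the entry shape follows the HumanReadableDescription format seen in the tests:

package main

import (
	"fmt"
	"strings"
)

// alignBracketColumn pads the text before the first '[' so that all bracketed
// descriptions start at the same column, mirroring the alignment step above.
func alignBracketColumn(entries []string) []string {
	maxIndent := 0
	for _, entry := range entries {
		if idx := strings.Index(entry, "["); idx > maxIndent {
			maxIndent = idx
		}
	}
	aligned := make([]string, len(entries))
	for i, entry := range entries {
		idx := strings.Index(entry, "[")
		if idx < 0 || idx >= maxIndent {
			aligned[i] = entry
			continue
		}
		aligned[i] = entry[:idx] + strings.Repeat(" ", maxIndent-idx) + entry[idx:]
	}
	return aligned
}

func main() {
	entries := []string{ // hypothetical topology entries
		"master:3306 [unknown,invalid,5.7.8-log,rw,ROW,>>,P-GTID]",
		"+ replica1:3306 [unknown,invalid,5.7.8-log,rw,ROW,>>,P-GTID]",
	}
	for _, line := range alignBracketColumn(entries) {
		fmt.Println(line)
	}
}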
+ if tabulated { + entries = util.Tabulate(entries, "|", "|", util.TabulateLeft, util.TabulateRight) + } else { + indentationCharacter := "[" + maxIndent := 0 + for _, entry := range entries { + maxIndent = math.MaxInt(maxIndent, strings.Index(entry, indentationCharacter)) + } + for i, entry := range entries { + entryIndent := strings.Index(entry, indentationCharacter) + if maxIndent > entryIndent { + tokens := strings.SplitN(entry, indentationCharacter, 2) + newEntry := fmt.Sprintf("%s%s%s%s", tokens[0], strings.Repeat(fillerCharacter, maxIndent-entryIndent), indentationCharacter, tokens[1]) + entries[i] = newEntry + } + } + } + // Turn into string + result = strings.Join(entries, "\n") + return result, nil +} + +func shouldPostponeRelocatingReplica(replica *Instance, postponedFunctionsContainer *PostponedFunctionsContainer) bool { + if postponedFunctionsContainer == nil { + return false + } + if config.Config.PostponeReplicaRecoveryOnLagMinutes > 0 && + replica.SQLDelay > config.Config.PostponeReplicaRecoveryOnLagMinutes*60 { + // This replica is lagging very much, AND + // we're configured to postpone operation on this replica so as not to delay everyone else. + return true + } + if replica.LastDiscoveryLatency > ReasonableDiscoveryLatency { + return true + } + return false +} + +// GetInstanceMaster synchronously reaches into the replication topology +// and retrieves master's data +func GetInstanceMaster(instance *Instance) (*Instance, error) { + master, err := ReadTopologyInstance(&instance.MasterKey) + return master, err +} + +// InstancesAreSiblings checks whether both instances are replicating from same master +func InstancesAreSiblings(instance0, instance1 *Instance) bool { + if !instance0.IsReplica() { + return false + } + if !instance1.IsReplica() { + return false + } + if instance0.Key.Equals(&instance1.Key) { + // same instance... + return false + } + return instance0.MasterKey.Equals(&instance1.MasterKey) +} + +// InstanceIsMasterOf checks whether an instance is the master of another +func InstanceIsMasterOf(allegedMaster, allegedReplica *Instance) bool { + if !allegedReplica.IsReplica() { + return false + } + if allegedMaster.Key.Equals(&allegedReplica.Key) { + // same instance... + return false + } + return allegedMaster.Key.Equals(&allegedReplica.MasterKey) +} + +// MoveEquivalent will attempt moving instance indicated by instanceKey below another instance, +// based on known master coordinates equivalence +func MoveEquivalent(instanceKey, otherKey *InstanceKey) (*Instance, error) { + instance, found, err := ReadInstance(instanceKey) + if err != nil || !found { + return instance, err + } + if instance.Key.Equals(otherKey) { + return instance, fmt.Errorf("MoveEquivalent: attempt to move an instance below itself %+v", instance.Key) + } + + // Are there equivalent coordinates to this instance? + instanceCoordinates := &InstanceBinlogCoordinates{Key: instance.MasterKey, Coordinates: instance.ExecBinlogCoordinates} + binlogCoordinates, err := GetEquivalentBinlogCoordinatesFor(instanceCoordinates, otherKey) + if err != nil { + return instance, err + } + if binlogCoordinates == nil { + return instance, fmt.Errorf("No equivalent coordinates found for %+v replicating from %+v at %+v", instance.Key, instance.MasterKey, instance.ExecBinlogCoordinates) + } + // For performance reasons, we did all the above before even checking the replica is stopped or stopping it at all. + // This allows us to quickly skip the entire operation should there NOT be coordinates. 
+ // To elaborate: if the replica is actually running AND making progress, it is unlikely/impossible for it to have + // equivalent coordinates, as the current coordinates are like to have never been seen. + // This excludes the case, for example, that the master is itself not replicating. + // Now if we DO get to happen on equivalent coordinates, we need to double check. For CHANGE MASTER to happen we must + // stop the replica anyhow. But then let's verify the position hasn't changed. + knownExecBinlogCoordinates := instance.ExecBinlogCoordinates + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + if !instance.ExecBinlogCoordinates.Equals(&knownExecBinlogCoordinates) { + // Seems like things were still running... We don't have an equivalence point + err = fmt.Errorf("MoveEquivalent(): ExecBinlogCoordinates changed after stopping replication on %+v; aborting", instance.Key) + goto Cleanup + } + instance, err = ChangeMasterTo(instanceKey, otherKey, binlogCoordinates, false, GTIDHintNeutral) + +Cleanup: + instance, _ = StartReplication(instanceKey) + + if err == nil { + message := fmt.Sprintf("moved %+v via equivalence coordinates below %+v", *instanceKey, *otherKey) + log.Debugf(message) + AuditOperation("move-equivalent", instanceKey, message) + } + return instance, err +} + +// MoveUp will attempt moving instance indicated by instanceKey up the topology hierarchy. +// It will perform all safety and sanity checks and will tamper with this instance's replication +// as well as its master. +func MoveUp(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + if !instance.IsReplica() { + return instance, fmt.Errorf("instance is not a replica: %+v", instanceKey) + } + rinstance, _, _ := ReadInstance(&instance.Key) + if canMove, merr := rinstance.CanMove(); !canMove { + return instance, merr + } + master, err := GetInstanceMaster(instance) + if err != nil { + return instance, log.Errorf("Cannot GetInstanceMaster() for %+v. 
error=%+v", instance.Key, err) + } + + if !master.IsReplica() { + return instance, fmt.Errorf("master is not a replica itself: %+v", master.Key) + } + + if canReplicate, err := instance.CanReplicateFrom(master); canReplicate == false { + return instance, err + } + if master.IsBinlogServer() { + // Quick solution via binlog servers + return Repoint(instanceKey, &master.MasterKey, GTIDHintDeny) + } + + log.Infof("Will move %+v up the topology", *instanceKey) + + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), "move up"); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + if maintenanceToken, merr := BeginMaintenance(&master.Key, GetMaintenanceOwner(), fmt.Sprintf("child %+v moves up", *instanceKey)); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", master.Key, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + if !instance.UsingMariaDBGTID { + master, err = StopReplication(&master.Key) + if err != nil { + goto Cleanup + } + } + + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + + if !instance.UsingMariaDBGTID { + instance, err = StartReplicationUntilMasterCoordinates(instanceKey, &master.SelfBinlogCoordinates) + if err != nil { + goto Cleanup + } + } + + // We can skip hostname unresolve; we just copy+paste whatever our master thinks of its master. + instance, err = ChangeMasterTo(instanceKey, &master.MasterKey, &master.ExecBinlogCoordinates, true, GTIDHintDeny) + if err != nil { + goto Cleanup + } + +Cleanup: + instance, _ = StartReplication(instanceKey) + if !instance.UsingMariaDBGTID { + master, _ = StartReplication(&master.Key) + } + if err != nil { + return instance, log.Errore(err) + } + // and we're done (pending deferred functions) + AuditOperation("move-up", instanceKey, fmt.Sprintf("moved up %+v. Previous master: %+v", *instanceKey, master.Key)) + + return instance, err +} + +// MoveUpReplicas will attempt moving up all replicas of a given instance, at the same time. +// Clock-time, this is fater than moving one at a time. However this means all replicas of the given instance, and the instance itself, +// will all stop replicating together. +func MoveUpReplicas(instanceKey *InstanceKey, pattern string) ([](*Instance), *Instance, error, []error) { + res := [](*Instance){} + errs := []error{} + replicaMutex := make(chan bool, 1) + var barrier chan *InstanceKey + + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return res, nil, err, errs + } + if !instance.IsReplica() { + return res, instance, fmt.Errorf("instance is not a replica: %+v", instanceKey), errs + } + _, err = GetInstanceMaster(instance) + if err != nil { + return res, instance, log.Errorf("Cannot GetInstanceMaster() for %+v. error=%+v", instance.Key, err), errs + } + + if instance.IsBinlogServer() { + replicas, err, errors := RepointReplicasTo(instanceKey, pattern, &instance.MasterKey) + // Bail out! 
+ return replicas, instance, err, errors + } + + replicas, err := ReadReplicaInstances(instanceKey) + if err != nil { + return res, instance, err, errs + } + replicas = filterInstancesByPattern(replicas, pattern) + if len(replicas) == 0 { + return res, instance, nil, errs + } + log.Infof("Will move replicas of %+v up the topology", *instanceKey) + + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), "move up replicas"); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + for _, replica := range replicas { + if maintenanceToken, merr := BeginMaintenance(&replica.Key, GetMaintenanceOwner(), fmt.Sprintf("%+v moves up", replica.Key)); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", replica.Key, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + } + + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + + barrier = make(chan *InstanceKey) + for _, replica := range replicas { + replica := replica + go func() { + defer func() { + defer func() { barrier <- &replica.Key }() + StartReplication(&replica.Key) + }() + + var replicaErr error + ExecuteOnTopology(func() { + if canReplicate, err := replica.CanReplicateFrom(instance); canReplicate == false || err != nil { + replicaErr = err + return + } + if instance.IsBinlogServer() { + // Special case. Just repoint + replica, err = Repoint(&replica.Key, instanceKey, GTIDHintDeny) + if err != nil { + replicaErr = err + return + } + } else { + // Normal case. Do the math. + replica, err = StopReplication(&replica.Key) + if err != nil { + replicaErr = err + return + } + replica, err = StartReplicationUntilMasterCoordinates(&replica.Key, &instance.SelfBinlogCoordinates) + if err != nil { + replicaErr = err + return + } + + replica, err = ChangeMasterTo(&replica.Key, &instance.MasterKey, &instance.ExecBinlogCoordinates, false, GTIDHintDeny) + if err != nil { + replicaErr = err + return + } + } + }) + + func() { + replicaMutex <- true + defer func() { <-replicaMutex }() + if replicaErr == nil { + res = append(res, replica) + } else { + errs = append(errs, replicaErr) + } + }() + }() + } + for range replicas { + <-barrier + } + +Cleanup: + instance, _ = StartReplication(instanceKey) + if err != nil { + return res, instance, log.Errore(err), errs + } + if len(errs) == len(replicas) { + // All returned with error + return res, instance, log.Error("Error on all operations"), errs + } + AuditOperation("move-up-replicas", instanceKey, fmt.Sprintf("moved up %d/%d replicas of %+v. New master: %+v", len(res), len(replicas), *instanceKey, instance.MasterKey)) + + return res, instance, err, errs +} + +// MoveBelow will attempt moving instance indicated by instanceKey below its supposed sibling indicated by sinblingKey. +// It will perform all safety and sanity checks and will tamper with this instance's replication +// as well as its sibling. 
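MoveUpReplicas above (and RepointTo further down) fan out one goroutine per replica, use an unbuffered channel as a completion barrier, and use a one-slot channel as an instantaneous mutex around the shared result slices. A stripped-down sketch of that coordination pattern, with placeholder work and hypothetical keys standing in for the real replication calls:

package main

import "fmt"

func main() {
	replicas := []string{"replica1:3306", "replica2:3306", "replica3:3306"} // hypothetical keys

	barrier := make(chan string) // completion barrier: exactly one send per replica
	mutex := make(chan bool, 1)  // one-slot channel used as an instantaneous mutex
	succeeded := []string{}

	for _, replica := range replicas {
		replica := replica // capture the loop variable for the goroutine
		go func() {
			defer func() { barrier <- replica }()
			// ... the real code stops replication, runs START SLAVE UNTIL, then CHANGE MASTER TO ...
			func() {
				mutex <- true
				defer func() { <-mutex }()
				succeeded = append(succeeded, replica)
			}()
		}()
	}
	for range replicas {
		<-barrier // wait until every goroutine has reported in
	}
	fmt.Println("succeeded:", succeeded)
}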
+func MoveBelow(instanceKey, siblingKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + sibling, err := ReadTopologyInstance(siblingKey) + if err != nil { + return instance, err + } + // Relocation of group secondaries makes no sense, group secondaries, by definition, always replicate from the group + // primary + if instance.IsReplicationGroupSecondary() { + return instance, log.Errorf("MoveBelow: %+v is a secondary replication group member, hence, it cannot be relocated", instance.Key) + } + + if sibling.IsBinlogServer() { + // Binlog server has same coordinates as master + // Easy solution! + return Repoint(instanceKey, &sibling.Key, GTIDHintDeny) + } + + rinstance, _, _ := ReadInstance(&instance.Key) + if canMove, merr := rinstance.CanMove(); !canMove { + return instance, merr + } + + rinstance, _, _ = ReadInstance(&sibling.Key) + if canMove, merr := rinstance.CanMove(); !canMove { + return instance, merr + } + if !InstancesAreSiblings(instance, sibling) { + return instance, fmt.Errorf("instances are not siblings: %+v, %+v", *instanceKey, *siblingKey) + } + + if canReplicate, err := instance.CanReplicateFrom(sibling); !canReplicate { + return instance, err + } + log.Infof("Will move %+v below %+v", instanceKey, siblingKey) + + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), fmt.Sprintf("move below %+v", *siblingKey)); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + if maintenanceToken, merr := BeginMaintenance(siblingKey, GetMaintenanceOwner(), fmt.Sprintf("%+v moves below this", *instanceKey)); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *siblingKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + + sibling, err = StopReplication(siblingKey) + if err != nil { + goto Cleanup + } + if instance.ExecBinlogCoordinates.SmallerThan(&sibling.ExecBinlogCoordinates) { + instance, err = StartReplicationUntilMasterCoordinates(instanceKey, &sibling.ExecBinlogCoordinates) + if err != nil { + goto Cleanup + } + } else if sibling.ExecBinlogCoordinates.SmallerThan(&instance.ExecBinlogCoordinates) { + sibling, err = StartReplicationUntilMasterCoordinates(siblingKey, &instance.ExecBinlogCoordinates) + if err != nil { + goto Cleanup + } + } + // At this point both siblings have executed exact same statements and are identical + + instance, err = ChangeMasterTo(instanceKey, &sibling.Key, &sibling.SelfBinlogCoordinates, false, GTIDHintDeny) + if err != nil { + goto Cleanup + } + +Cleanup: + instance, _ = StartReplication(instanceKey) + sibling, _ = StartReplication(siblingKey) + + if err != nil { + return instance, log.Errore(err) + } + // and we're done (pending deferred functions) + AuditOperation("move-below", instanceKey, fmt.Sprintf("moved %+v below %+v", *instanceKey, *siblingKey)) + + return instance, err +} + +func canReplicateAssumingOracleGTID(instance, masterInstance *Instance) (canReplicate bool, err error) { + subtract, err := GTIDSubtract(&instance.Key, masterInstance.GtidPurged, instance.ExecutedGtidSet) + if err != nil { + return false, err + } + subtractGtidSet, err := NewOracleGtidSet(subtract) + if err != nil { + return false, err + } + return subtractGtidSet.IsEmpty(), nil +} + +func 
instancesAreGTIDAndCompatible(instance, otherInstance *Instance) (isOracleGTID bool, isMariaDBGTID, compatible bool) { + isOracleGTID = (instance.UsingOracleGTID && otherInstance.SupportsOracleGTID) + isMariaDBGTID = (instance.UsingMariaDBGTID && otherInstance.IsMariaDB()) + compatible = isOracleGTID || isMariaDBGTID + return isOracleGTID, isMariaDBGTID, compatible +} + +func CheckMoveViaGTID(instance, otherInstance *Instance) (err error) { + isOracleGTID, _, moveCompatible := instancesAreGTIDAndCompatible(instance, otherInstance) + if !moveCompatible { + return fmt.Errorf("Instances %+v, %+v not GTID compatible or not using GTID", instance.Key, otherInstance.Key) + } + if isOracleGTID { + canReplicate, err := canReplicateAssumingOracleGTID(instance, otherInstance) + if err != nil { + return err + } + if !canReplicate { + return fmt.Errorf("Instance %+v has purged GTID entries not found on %+v", otherInstance.Key, instance.Key) + } + } + + return nil +} + +// moveInstanceBelowViaGTID will attempt moving given instance below another instance using either Oracle GTID or MariaDB GTID. +func moveInstanceBelowViaGTID(instance, otherInstance *Instance) (*Instance, error) { + rinstance, _, _ := ReadInstance(&instance.Key) + if canMove, merr := rinstance.CanMoveViaMatch(); !canMove { + return instance, merr + } + + if canReplicate, err := instance.CanReplicateFrom(otherInstance); !canReplicate { + return instance, err + } + if err := CheckMoveViaGTID(instance, otherInstance); err != nil { + return instance, err + } + log.Infof("Will move %+v below %+v via GTID", instance.Key, otherInstance.Key) + + instanceKey := &instance.Key + otherInstanceKey := &otherInstance.Key + + var err error + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), fmt.Sprintf("move below %+v", *otherInstanceKey)); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + + instance, err = ChangeMasterTo(instanceKey, &otherInstance.Key, &otherInstance.SelfBinlogCoordinates, false, GTIDHintForce) + if err != nil { + goto Cleanup + } +Cleanup: + instance, _ = StartReplication(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + // and we're done (pending deferred functions) + AuditOperation("move-below-gtid", instanceKey, fmt.Sprintf("moved %+v below %+v", *instanceKey, *otherInstanceKey)) + + return instance, err +} + +// MoveBelowGTID will attempt moving instance indicated by instanceKey below another instance using either Oracle GTID or MariaDB GTID. 
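The safety test in canReplicateAssumingOracleGTID above reduces to a subset check: every transaction the prospective master has already purged from its binary logs must be present in the replica's executed set, otherwise the replica would request events that no longer exist. A toy illustration of that check, using plain string sets and an illustrative helper name in place of the real interval-based GTID sets (the actual code goes through GTIDSubtract and OracleGtidSet):

package main

import "fmt"

// purgedIsCoveredBy reports whether every transaction in purged also appears in
// executed; only then is it safe to replicate from the server that purged them.
func purgedIsCoveredBy(purged, executed map[string]bool) bool {
	for gtid := range purged {
		if !executed[gtid] {
			return false // the candidate master discarded a transaction the replica still needs
		}
	}
	return true
}

func main() {
	masterPurged := map[string]bool{"uuid-a:1": true, "uuid-a:2": true}                      // hypothetical gtid_purged
	replicaExecuted := map[string]bool{"uuid-a:1": true, "uuid-a:2": true, "uuid-a:3": true} // hypothetical gtid_executed
	fmt.Println(purgedIsCoveredBy(masterPurged, replicaExecuted))                            // true: the move via GTID is safe
}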
+func MoveBelowGTID(instanceKey, otherKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + other, err := ReadTopologyInstance(otherKey) + if err != nil { + return instance, err + } + // Relocation of group secondaries makes no sense, group secondaries, by definition, always replicate from the group + // primary + if instance.IsReplicationGroupSecondary() { + return instance, log.Errorf("MoveBelowGTID: %+v is a secondary replication group member, hence, it cannot be relocated", instance.Key) + } + return moveInstanceBelowViaGTID(instance, other) +} + +// moveReplicasViaGTID moves a list of replicas under another instance via GTID, returning those replicas +// that could not be moved (do not use GTID or had GTID errors) +func moveReplicasViaGTID(replicas [](*Instance), other *Instance, postponedFunctionsContainer *PostponedFunctionsContainer) (movedReplicas [](*Instance), unmovedReplicas [](*Instance), err error, errs []error) { + replicas = RemoveNilInstances(replicas) + replicas = RemoveInstance(replicas, &other.Key) + if len(replicas) == 0 { + // Nothing to do + return movedReplicas, unmovedReplicas, nil, errs + } + + log.Infof("moveReplicasViaGTID: Will move %+v replicas below %+v via GTID, max concurrency: %v", + len(replicas), + other.Key, + config.Config.MaxConcurrentReplicaOperations) + + var waitGroup sync.WaitGroup + var replicaMutex sync.Mutex + + var concurrencyChan = make(chan bool, config.Config.MaxConcurrentReplicaOperations) + + for _, replica := range replicas { + replica := replica + + waitGroup.Add(1) + // Parallelize repoints + go func() { + defer waitGroup.Done() + moveFunc := func() error { + + concurrencyChan <- true + defer func() { recover(); <-concurrencyChan }() + + movedReplica, replicaErr := moveInstanceBelowViaGTID(replica, other) + if replicaErr != nil && movedReplica != nil { + replica = movedReplica + } + + // After having moved replicas, update local shared variables: + replicaMutex.Lock() + defer replicaMutex.Unlock() + + if replicaErr == nil { + movedReplicas = append(movedReplicas, replica) + } else { + unmovedReplicas = append(unmovedReplicas, replica) + errs = append(errs, replicaErr) + } + return replicaErr + } + if shouldPostponeRelocatingReplica(replica, postponedFunctionsContainer) { + postponedFunctionsContainer.AddPostponedFunction(moveFunc, fmt.Sprintf("move-replicas-gtid %+v", replica.Key)) + // We bail out and trust our invoker to later call upon this postponed function + } else { + ExecuteOnTopology(func() { moveFunc() }) + } + }() + } + waitGroup.Wait() + + if len(errs) == len(replicas) { + // All returned with error + return movedReplicas, unmovedReplicas, fmt.Errorf("moveReplicasViaGTID: Error on all %+v operations", len(errs)), errs + } + AuditOperation("move-replicas-gtid", &other.Key, fmt.Sprintf("moved %d/%d replicas below %+v via GTID", len(movedReplicas), len(replicas), other.Key)) + + return movedReplicas, unmovedReplicas, err, errs +} + +// MoveReplicasGTID will (attempt to) move all replicas of given master below given instance. 
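moveReplicasViaGTID above bounds its parallelism with a buffered channel acting as a semaphore, sized by MaxConcurrentReplicaOperations, while a sync.WaitGroup waits for all goroutines and a mutex guards the shared result slices. A minimal sketch of that throttling pattern, with placeholder work and hypothetical keys:

package main

import (
	"fmt"
	"sync"
	"time"
)

func main() {
	replicas := []string{"r1:3306", "r2:3306", "r3:3306", "r4:3306"} // hypothetical keys
	maxConcurrent := 2 // stands in for config.Config.MaxConcurrentReplicaOperations

	var waitGroup sync.WaitGroup
	var resultMutex sync.Mutex
	concurrencyChan := make(chan bool, maxConcurrent) // buffered channel as a semaphore
	moved := []string{}

	for _, replica := range replicas {
		replica := replica
		waitGroup.Add(1)
		go func() {
			defer waitGroup.Done()
			concurrencyChan <- true              // blocks while maxConcurrent moves are in flight
			defer func() { <-concurrencyChan }() // releases the slot
			time.Sleep(10 * time.Millisecond)    // placeholder for the real per-replica GTID move
			resultMutex.Lock()
			defer resultMutex.Unlock()
			moved = append(moved, replica)
		}()
	}
	waitGroup.Wait()
	fmt.Println("moved:", moved)
}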
+func MoveReplicasGTID(masterKey *InstanceKey, belowKey *InstanceKey, pattern string) (movedReplicas [](*Instance), unmovedReplicas [](*Instance), err error, errs []error) { + belowInstance, err := ReadTopologyInstance(belowKey) + if err != nil { + // Can't access "below" ==> can't move replicas beneath it + return movedReplicas, unmovedReplicas, err, errs + } + + // replicas involved + replicas, err := ReadReplicaInstancesIncludingBinlogServerSubReplicas(masterKey) + if err != nil { + return movedReplicas, unmovedReplicas, err, errs + } + replicas = filterInstancesByPattern(replicas, pattern) + movedReplicas, unmovedReplicas, err, errs = moveReplicasViaGTID(replicas, belowInstance, nil) + if err != nil { + log.Errore(err) + } + + if len(unmovedReplicas) > 0 { + err = fmt.Errorf("MoveReplicasGTID: only moved %d out of %d replicas of %+v; error is: %+v", len(movedReplicas), len(replicas), *masterKey, err) + } + + return movedReplicas, unmovedReplicas, err, errs +} + +// Repoint connects a replica to a master using its exact same executing coordinates. +// The given masterKey can be null, in which case the existing master is used. +// Two use cases: +// - masterKey is nil: use case is corrupted relay logs on replica +// - masterKey is not nil: using Binlog servers (coordinates remain the same) +func Repoint(instanceKey *InstanceKey, masterKey *InstanceKey, gtidHint OperationGTIDHint) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + if !instance.IsReplica() { + return instance, fmt.Errorf("instance is not a replica: %+v", *instanceKey) + } + // Relocation of group secondaries makes no sense, group secondaries, by definition, always replicate from the group + // primary + if instance.IsReplicationGroupSecondary() { + return instance, fmt.Errorf("repoint: %+v is a secondary replication group member, hence, it cannot be relocated", instance.Key) + } + if masterKey == nil { + masterKey = &instance.MasterKey + } + // With repoint we *prefer* the master to be alive, but we don't strictly require it. + // The use case for the master being alive is with hostname-resolve or hostname-unresolve: asking the replica + // to reconnect to its same master while changing the MASTER_HOST in CHANGE MASTER TO due to DNS changes etc. + master, err := ReadTopologyInstance(masterKey) + masterIsAccessible := (err == nil) + if !masterIsAccessible { + master, _, err = ReadInstance(masterKey) + if master == nil || err != nil { + return instance, err + } + } + if canReplicate, err := instance.CanReplicateFrom(master); !canReplicate { + return instance, err + } + + // if a binlog server check it is sufficiently up to date + if master.IsBinlogServer() { + // "Repoint" operation trusts the user. But only so much. Repoiting to a binlog server which is not yet there is strictly wrong. 
+ if !instance.ExecBinlogCoordinates.SmallerThanOrEquals(&master.SelfBinlogCoordinates) { + return instance, fmt.Errorf("repoint: binlog server %+v is not sufficiently up to date to repoint %+v below it", *masterKey, *instanceKey) + } + } + + log.Infof("Will repoint %+v to master %+v", *instanceKey, *masterKey) + + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), "repoint"); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + + // See above, we are relaxed about the master being accessible/inaccessible. + // If accessible, we wish to do hostname-unresolve. If inaccessible, we can skip the test and not fail the + // ChangeMasterTo operation. This is why we pass "!masterIsAccessible" below. + if instance.ExecBinlogCoordinates.IsEmpty() { + instance.ExecBinlogCoordinates.LogFile = "orchestrator-unknown-log-file" + } + instance, err = ChangeMasterTo(instanceKey, masterKey, &instance.ExecBinlogCoordinates, !masterIsAccessible, gtidHint) + if err != nil { + goto Cleanup + } + +Cleanup: + instance, _ = StartReplication(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + // and we're done (pending deferred functions) + AuditOperation("repoint", instanceKey, fmt.Sprintf("replica %+v repointed to master: %+v", *instanceKey, *masterKey)) + + return instance, err + +} + +// RepointTo repoints list of replicas onto another master. +// Binlog Server is the major use case +func RepointTo(replicas [](*Instance), belowKey *InstanceKey) ([](*Instance), error, []error) { + res := [](*Instance){} + errs := []error{} + + replicas = RemoveInstance(replicas, belowKey) + if len(replicas) == 0 { + // Nothing to do + return res, nil, errs + } + if belowKey == nil { + return res, log.Errorf("RepointTo received nil belowKey"), errs + } + + log.Infof("Will repoint %+v replicas below %+v", len(replicas), *belowKey) + barrier := make(chan *InstanceKey) + replicaMutex := make(chan bool, 1) + for _, replica := range replicas { + replica := replica + + // Parallelize repoints + go func() { + defer func() { barrier <- &replica.Key }() + ExecuteOnTopology(func() { + replica, replicaErr := Repoint(&replica.Key, belowKey, GTIDHintNeutral) + + func() { + // Instantaneous mutex. + replicaMutex <- true + defer func() { <-replicaMutex }() + if replicaErr == nil { + res = append(res, replica) + } else { + errs = append(errs, replicaErr) + } + }() + }) + }() + } + for range replicas { + <-barrier + } + + if len(errs) == len(replicas) { + // All returned with error + return res, log.Error("Error on all operations"), errs + } + AuditOperation("repoint-to", belowKey, fmt.Sprintf("repointed %d/%d replicas to %+v", len(res), len(replicas), *belowKey)) + + return res, nil, errs +} + +// RepointReplicasTo repoints replicas of a given instance (possibly filtered) onto another master. 
+// Binlog Server is the major use case +func RepointReplicasTo(instanceKey *InstanceKey, pattern string, belowKey *InstanceKey) ([](*Instance), error, []error) { + res := [](*Instance){} + errs := []error{} + + replicas, err := ReadReplicaInstances(instanceKey) + if err != nil { + return res, err, errs + } + replicas = RemoveInstance(replicas, belowKey) + replicas = filterInstancesByPattern(replicas, pattern) + if len(replicas) == 0 { + // Nothing to do + return res, nil, errs + } + if belowKey == nil { + // Default to existing master. All replicas are of the same master, hence just pick one. + belowKey = &replicas[0].MasterKey + } + log.Infof("Will repoint replicas of %+v to %+v", *instanceKey, *belowKey) + return RepointTo(replicas, belowKey) +} + +// RepointReplicas repoints all replicas of a given instance onto its existing master. +func RepointReplicas(instanceKey *InstanceKey, pattern string) ([](*Instance), error, []error) { + return RepointReplicasTo(instanceKey, pattern, nil) +} + +// MakeCoMaster will attempt to make an instance co-master with its master, by making its master a replica of its own. +// This only works out if the master is not replicating; the master does not have a known master (it may have an unknown master). +func MakeCoMaster(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + if canMove, merr := instance.CanMove(); !canMove { + return instance, merr + } + master, err := GetInstanceMaster(instance) + if err != nil { + return instance, err + } + // Relocation of group secondaries makes no sense, group secondaries, by definition, always replicate from the group + // primary + if instance.IsReplicationGroupSecondary() { + return instance, fmt.Errorf("MakeCoMaster: %+v is a secondary replication group member, hence, it cannot be relocated", instance.Key) + } + log.Debugf("Will check whether %+v's master (%+v) can become its co-master", instance.Key, master.Key) + if canMove, merr := master.CanMoveAsCoMaster(); !canMove { + return instance, merr + } + if instanceKey.Equals(&master.MasterKey) { + return instance, fmt.Errorf("instance %+v is already co master of %+v", instance.Key, master.Key) + } + if !instance.ReadOnly { + return instance, fmt.Errorf("instance %+v is not read-only; first make it read-only before making it co-master", instance.Key) + } + if master.IsCoMaster { + // We allow breaking of an existing co-master replication. Here's the breakdown: + // Ideally, this would not eb allowed, and we would first require the user to RESET SLAVE on 'master' + // prior to making it participate as co-master with our 'instance'. + // However there's the problem that upon RESET SLAVE we lose the replication's user/password info. + // Thus, we come up with the following rule: + // If S replicates from M1, and M1<->M2 are co masters, we allow S to become co-master of M1 (S<->M1) if: + // - M1 is writeable + // - M2 is read-only or is unreachable/invalid + // - S is read-only + // And so we will be replacing one read-only co-master with another. + otherCoMaster, found, _ := ReadInstance(&master.MasterKey) + if found && otherCoMaster.IsLastCheckValid && !otherCoMaster.ReadOnly { + return instance, fmt.Errorf("master %+v is already co-master with %+v, and %+v is alive, and not read-only; cowardly refusing to demote it. Please set it as read-only beforehand", master.Key, otherCoMaster.Key, otherCoMaster.Key) + } + // OK, good to go. 
+ } else if _, found, _ := ReadInstance(&master.MasterKey); found { + return instance, fmt.Errorf("%+v is not a real master; it replicates from: %+v", master.Key, master.MasterKey) + } + if canReplicate, err := master.CanReplicateFrom(instance); !canReplicate { + return instance, err + } + log.Infof("Will make %+v co-master of %+v", instanceKey, master.Key) + + var gitHint OperationGTIDHint = GTIDHintNeutral + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), fmt.Sprintf("make co-master of %+v", master.Key)); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + if maintenanceToken, merr := BeginMaintenance(&master.Key, GetMaintenanceOwner(), fmt.Sprintf("%+v turns into co-master of this", *instanceKey)); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", master.Key, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + // the coMaster used to be merely a replica. Just point master into *some* position + // within coMaster... + if master.IsReplica() { + // this is the case of a co-master. For masters, the StopReplication operation throws an error, and + // there's really no point in doing it. + master, err = StopReplication(&master.Key) + if err != nil { + goto Cleanup + } + } + if !master.HasReplicationCredentials { + // Let's try , if possible, to get credentials from replica. Best effort. + if replicationUser, replicationPassword, credentialsErr := ReadReplicationCredentials(&instance.Key); credentialsErr == nil { + log.Debugf("Got credentials from a replica. will now apply") + _, err = ChangeMasterCredentials(&master.Key, replicationUser, replicationPassword) + if err != nil { + goto Cleanup + } + } + } + + if instance.AllowTLS { + log.Debugf("Enabling SSL replication") + _, err = EnableMasterSSL(&master.Key) + if err != nil { + goto Cleanup + } + } + + if instance.UsingOracleGTID { + gitHint = GTIDHintForce + } + master, err = ChangeMasterTo(&master.Key, instanceKey, &instance.SelfBinlogCoordinates, false, gitHint) + if err != nil { + goto Cleanup + } + +Cleanup: + master, _ = StartReplication(&master.Key) + if err != nil { + return instance, log.Errore(err) + } + // and we're done (pending deferred functions) + AuditOperation("make-co-master", instanceKey, fmt.Sprintf("%+v made co-master of %+v", *instanceKey, master.Key)) + + return instance, err +} + +// ResetReplicationOperation will reset a replica +func ResetReplicationOperation(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + + log.Infof("Will reset replica on %+v", instanceKey) + + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), "reset replica"); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + if instance.IsReplica() { + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + } + + instance, err = ResetReplication(instanceKey) + if err != nil { + goto Cleanup + } + +Cleanup: + instance, _ = StartReplication(instanceKey) + + if err != nil { + return instance, log.Errore(err) + } + + // and we're done (pending deferred functions) + AuditOperation("reset-slave", instanceKey, fmt.Sprintf("%+v replication reset", *instanceKey)) + + return instance, err +} + +// 
DetachReplicaMasterHost detaches a replica from its master by corrupting the Master_Host (in such way that is reversible) +func DetachReplicaMasterHost(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + if !instance.IsReplica() { + return instance, fmt.Errorf("instance is not a replica: %+v", *instanceKey) + } + if instance.MasterKey.IsDetached() { + return instance, fmt.Errorf("instance already detached: %+v", *instanceKey) + } + detachedMasterKey := instance.MasterKey.DetachedKey() + + log.Infof("Will detach master host on %+v. Detached key is %+v", *instanceKey, *detachedMasterKey) + + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), "detach-replica-master-host"); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + + instance, err = ChangeMasterTo(instanceKey, detachedMasterKey, &instance.ExecBinlogCoordinates, true, GTIDHintNeutral) + if err != nil { + goto Cleanup + } + +Cleanup: + instance, _ = StartReplication(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + // and we're done (pending deferred functions) + AuditOperation("repoint", instanceKey, fmt.Sprintf("replica %+v detached from master into %+v", *instanceKey, *detachedMasterKey)) + + return instance, err +} + +// ReattachReplicaMasterHost reattaches a replica back onto its master by undoing a DetachReplicaMasterHost operation +func ReattachReplicaMasterHost(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + if !instance.IsReplica() { + return instance, fmt.Errorf("instance is not a replica: %+v", *instanceKey) + } + if !instance.MasterKey.IsDetached() { + return instance, fmt.Errorf("instance does not seem to be detached: %+v", *instanceKey) + } + + reattachedMasterKey := instance.MasterKey.ReattachedKey() + + log.Infof("Will reattach master host on %+v. 
Reattached key is %+v", *instanceKey, *reattachedMasterKey) + + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), "reattach-replica-master-host"); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + + instance, err = ChangeMasterTo(instanceKey, reattachedMasterKey, &instance.ExecBinlogCoordinates, true, GTIDHintNeutral) + if err != nil { + goto Cleanup + } + // Just in case this instance used to be a master: + ReplaceAliasClusterName(instanceKey.StringCode(), reattachedMasterKey.StringCode()) + +Cleanup: + instance, _ = StartReplication(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + // and we're done (pending deferred functions) + AuditOperation("repoint", instanceKey, fmt.Sprintf("replica %+v reattached to master %+v", *instanceKey, *reattachedMasterKey)) + + return instance, err +} + +// EnableGTID will attempt to enable GTID-mode (either Oracle or MariaDB) +func EnableGTID(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + if instance.UsingGTID() { + return instance, fmt.Errorf("%+v already uses GTID", *instanceKey) + } + + log.Infof("Will attempt to enable GTID on %+v", *instanceKey) + + instance, err = Repoint(instanceKey, nil, GTIDHintForce) + if err != nil { + return instance, err + } + if !instance.UsingGTID() { + return instance, fmt.Errorf("Cannot enable GTID on %+v", *instanceKey) + } + + AuditOperation("enable-gtid", instanceKey, fmt.Sprintf("enabled GTID on %+v", *instanceKey)) + + return instance, err +} + +// DisableGTID will attempt to disable GTID-mode (either Oracle or MariaDB) and revert to binlog file:pos replication +func DisableGTID(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + if !instance.UsingGTID() { + return instance, fmt.Errorf("%+v is not using GTID", *instanceKey) + } + + log.Infof("Will attempt to disable GTID on %+v", *instanceKey) + + instance, err = Repoint(instanceKey, nil, GTIDHintDeny) + if err != nil { + return instance, err + } + if instance.UsingGTID() { + return instance, fmt.Errorf("Cannot disable GTID on %+v", *instanceKey) + } + + AuditOperation("disable-gtid", instanceKey, fmt.Sprintf("disabled GTID on %+v", *instanceKey)) + + return instance, err +} + +func LocateErrantGTID(instanceKey *InstanceKey) (errantBinlogs []string, err error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return errantBinlogs, err + } + errantSearch := instance.GtidErrant + if errantSearch == "" { + return errantBinlogs, log.Errorf("locate-errant-gtid: no errant-gtid on %+v", *instanceKey) + } + subtract, err := GTIDSubtract(instanceKey, errantSearch, instance.GtidPurged) + if err != nil { + return errantBinlogs, err + } + if subtract != errantSearch { + return errantBinlogs, fmt.Errorf("locate-errant-gtid: %+v is already purged on %+v", subtract, *instanceKey) + } + binlogs, err := ShowBinaryLogs(instanceKey) + if err != nil { + return errantBinlogs, err + } + previousGTIDs := make(map[string]*OracleGtidSet) + for _, binlog := range binlogs { + oracleGTIDSet, err := GetPreviousGTIDs(instanceKey, binlog) + if err != nil { + return errantBinlogs, err + } + previousGTIDs[binlog] = oracleGTIDSet + } 
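+	// Walk the binary logs in order. Each binlog's Previous_gtids set describes everything executed before
+	// that binlog began, so once subtracting it from the remaining errant set removes entries, those entries
+	// must reside in the preceding binlog (the already-purged case was ruled out above).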
+ for i, binlog := range binlogs { + if errantSearch == "" { + break + } + previousGTID := previousGTIDs[binlog] + subtract, err := GTIDSubtract(instanceKey, errantSearch, previousGTID.String()) + if err != nil { + return errantBinlogs, err + } + if subtract != errantSearch { + // binlogs[i-1] is safe to use when i==0. because that implies GTIDs have been purged, + // which covered by an earlier assertion + errantBinlogs = append(errantBinlogs, binlogs[i-1]) + errantSearch = subtract + } + } + if errantSearch != "" { + // then it's in the last binary log + errantBinlogs = append(errantBinlogs, binlogs[len(binlogs)-1]) + } + return errantBinlogs, err +} + +// ErrantGTIDResetMaster will issue a safe RESET MASTER on a replica that replicates via GTID: +// It will make sure the gtid_purged set matches the executed set value as read just before the RESET. +// this will enable new replicas to be attached to given instance without complaints about missing/purged entries. +// This function requires that the instance does not have replicas. +func ErrantGTIDResetMaster(instanceKey *InstanceKey) (instance *Instance, err error) { + instance, err = ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + if instance.GtidErrant == "" { + return instance, log.Errorf("gtid-errant-reset-master will not operate on %+v because no errant GTID is found", *instanceKey) + } + if !instance.SupportsOracleGTID { + return instance, log.Errorf("gtid-errant-reset-master requested for %+v but it is not using oracle-gtid", *instanceKey) + } + if len(instance.Replicas) > 0 { + return instance, log.Errorf("gtid-errant-reset-master will not operate on %+v because it has %+v replicas. Expecting no replicas", *instanceKey, len(instance.Replicas)) + } + + gtidSubtract := "" + executedGtidSet := "" + masterStatusFound := false + replicationStopped := false + waitInterval := time.Second * 5 + + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), "reset-master-gtid"); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + if instance.IsReplica() { + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + replicationStopped, err = waitForReplicationState(instanceKey, ReplicationThreadStateStopped) + if err != nil { + goto Cleanup + } + if !replicationStopped { + err = fmt.Errorf("gtid-errant-reset-master: timeout while waiting for replication to stop on %+v", instance.Key) + goto Cleanup + } + } + + gtidSubtract, err = GTIDSubtract(instanceKey, instance.ExecutedGtidSet, instance.GtidErrant) + if err != nil { + goto Cleanup + } + + // We're about to perform a destructive operation. It is non transactional and cannot be rolled back. + // The replica will be left in a broken state. + // This is why we allow multiple attempts at the following: + for i := 0; i < countRetries; i++ { + instance, err = ResetMaster(instanceKey) + if err == nil { + break + } + time.Sleep(waitInterval) + } + if err != nil { + err = fmt.Errorf("gtid-errant-reset-master: error while resetting master on %+v, after which intended to set gtid_purged to: %s. Error was: %+v", instance.Key, gtidSubtract, err) + goto Cleanup + } + + masterStatusFound, executedGtidSet, err = ShowMasterStatus(instanceKey) + if err != nil { + err = fmt.Errorf("gtid-errant-reset-master: error getting master status on %+v, after which intended to set gtid_purged to: %s. 
Error was: %+v", instance.Key, gtidSubtract, err) + goto Cleanup + } + if !masterStatusFound { + err = fmt.Errorf("gtid-errant-reset-master: cannot get master status on %+v, after which intended to set gtid_purged to: %s.", instance.Key, gtidSubtract) + goto Cleanup + } + if executedGtidSet != "" { + err = fmt.Errorf("gtid-errant-reset-master: Unexpected non-empty Executed_Gtid_Set found on %+v following RESET MASTER, after which intended to set gtid_purged to: %s. Executed_Gtid_Set found to be: %+v", instance.Key, gtidSubtract, executedGtidSet) + goto Cleanup + } + + // We've just made the destructive operation. Again, allow for retries: + for i := 0; i < countRetries; i++ { + err = setGTIDPurged(instance, gtidSubtract) + if err == nil { + break + } + time.Sleep(waitInterval) + } + if err != nil { + err = fmt.Errorf("gtid-errant-reset-master: error setting gtid_purged on %+v to: %s. Error was: %+v", instance.Key, gtidSubtract, err) + goto Cleanup + } + +Cleanup: + var startReplicationErr error + instance, startReplicationErr = StartReplication(instanceKey) + log.Errore(startReplicationErr) + + if err != nil { + return instance, log.Errore(err) + } + + // and we're done (pending deferred functions) + AuditOperation("gtid-errant-reset-master", instanceKey, fmt.Sprintf("%+v master reset", *instanceKey)) + + return instance, err +} + +// ErrantGTIDInjectEmpty will inject an empty transaction on the master of an instance's cluster in order to get rid +// of an errant transaction observed on the instance. +func ErrantGTIDInjectEmpty(instanceKey *InstanceKey) (instance *Instance, clusterMaster *Instance, countInjectedTransactions int64, err error) { + instance, err = ReadTopologyInstance(instanceKey) + if err != nil { + return instance, clusterMaster, countInjectedTransactions, err + } + if instance.GtidErrant == "" { + return instance, clusterMaster, countInjectedTransactions, log.Errorf("gtid-errant-inject-empty will not operate on %+v because no errant GTID is found", *instanceKey) + } + if !instance.SupportsOracleGTID { + return instance, clusterMaster, countInjectedTransactions, log.Errorf("gtid-errant-inject-empty requested for %+v but it does not support oracle-gtid", *instanceKey) + } + + masters, err := ReadClusterWriteableMaster(instance.ClusterName) + if err != nil { + return instance, clusterMaster, countInjectedTransactions, err + } + if len(masters) == 0 { + return instance, clusterMaster, countInjectedTransactions, log.Errorf("gtid-errant-inject-empty found no writabel master for %+v cluster", instance.ClusterName) + } + clusterMaster = masters[0] + + if !clusterMaster.SupportsOracleGTID { + return instance, clusterMaster, countInjectedTransactions, log.Errorf("gtid-errant-inject-empty requested for %+v but the cluster's master %+v does not support oracle-gtid", *instanceKey, clusterMaster.Key) + } + + gtidSet, err := NewOracleGtidSet(instance.GtidErrant) + if err != nil { + return instance, clusterMaster, countInjectedTransactions, err + } + explodedEntries := gtidSet.Explode() + log.Infof("gtid-errant-inject-empty: about to inject %+v empty transactions %+v on cluster master %+v", len(explodedEntries), gtidSet.String(), clusterMaster.Key) + for _, entry := range explodedEntries { + if err := injectEmptyGTIDTransaction(&clusterMaster.Key, entry); err != nil { + return instance, clusterMaster, countInjectedTransactions, err + } + countInjectedTransactions++ + } + + // and we're done (pending deferred functions) + AuditOperation("gtid-errant-inject-empty", instanceKey, 
fmt.Sprintf("injected %+v empty transactions on %+v", countInjectedTransactions, clusterMaster.Key))
+
+	return instance, clusterMaster, countInjectedTransactions, err
+}
+
+// FindLastPseudoGTIDEntry will search an instance's binary logs or relay logs for the last pseudo-GTID entry,
+// and return found coordinates as well as entry text
+func FindLastPseudoGTIDEntry(instance *Instance, recordedInstanceRelayLogCoordinates BinlogCoordinates, maxBinlogCoordinates *BinlogCoordinates, exhaustiveSearch bool, expectedBinlogFormat *string) (instancePseudoGtidCoordinates *BinlogCoordinates, instancePseudoGtidText string, err error) {
+
+	if config.Config.PseudoGTIDPattern == "" {
+		return instancePseudoGtidCoordinates, instancePseudoGtidText, fmt.Errorf("PseudoGTIDPattern not configured; cannot use Pseudo-GTID")
+	}
+
+	if instance.LogBinEnabled && instance.LogReplicationUpdatesEnabled && !*config.RuntimeCLIFlags.SkipBinlogSearch && (expectedBinlogFormat == nil || instance.Binlog_format == *expectedBinlogFormat) {
+		minBinlogCoordinates, _, _ := GetHeuristiclyRecentCoordinatesForInstance(&instance.Key)
+		// Well, no need to search this instance's binary logs if it doesn't have any...
+		// With regard to log-slave-updates, some edge cases are possible, like having this instance's log-slave-updates
+		// enabled/disabled (of course having restarted it)
+		// The approach is not to take chances. If log-slave-updates is disabled, fail and go for relay-logs.
+		// If log-slave-updates was just enabled then possibly no pseudo-gtid is found, and so again we will go
+		// for relay logs.
+		// Also, if master has STATEMENT binlog format, and the replica has ROW binlog format, then comparing binlog entries would surely fail if based on the replica's binary logs.
+		// Instead, we revert to the relay logs.
+		instancePseudoGtidCoordinates, instancePseudoGtidText, err = getLastPseudoGTIDEntryInInstance(instance, minBinlogCoordinates, maxBinlogCoordinates, exhaustiveSearch)
+	}
+	if err != nil || instancePseudoGtidCoordinates == nil {
+		minRelaylogCoordinates, _ := GetPreviousKnownRelayLogCoordinatesForInstance(instance)
+		// Unable to find pseudo GTID in binary logs.
+		// Then MAYBE we are lucky enough (chances are we are, if this replica did not crash) that we can
+		// extract the Pseudo GTID entry from the last (current) relay log file.
+		instancePseudoGtidCoordinates, instancePseudoGtidText, err = getLastPseudoGTIDEntryInRelayLogs(instance, minRelaylogCoordinates, recordedInstanceRelayLogCoordinates, exhaustiveSearch)
+	}
+	return instancePseudoGtidCoordinates, instancePseudoGtidText, err
+}
+
+// CorrelateBinlogCoordinates finds out, if possible, the binlog coordinates of given otherInstance that correlate
+// with given coordinates of given instance.
+func CorrelateBinlogCoordinates(instance *Instance, binlogCoordinates *BinlogCoordinates, otherInstance *Instance) (*BinlogCoordinates, int, error) {
+	// We record the relay log coordinates just after the instance stopped since the coordinates can change upon
+	// a FLUSH LOGS/FLUSH RELAY LOGS (or a START SLAVE, though that's an altogether different problem) etc.
+	// We want to be on the safe side; we don't utterly trust that we are the only ones playing with the instance.
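+	// The overall flow: find the last Pseudo-GTID entry in this instance's logs, locate the identical entry
+	// in otherInstance's binary logs, then scan forward through both to compute the coordinates in
+	// otherInstance that correspond to this instance's current position.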
+ recordedInstanceRelayLogCoordinates := instance.RelaylogCoordinates + instancePseudoGtidCoordinates, instancePseudoGtidText, err := FindLastPseudoGTIDEntry(instance, recordedInstanceRelayLogCoordinates, binlogCoordinates, true, &otherInstance.Binlog_format) + + if err != nil { + return nil, 0, err + } + entriesMonotonic := (config.Config.PseudoGTIDMonotonicHint != "") && strings.Contains(instancePseudoGtidText, config.Config.PseudoGTIDMonotonicHint) + minBinlogCoordinates, _, err := GetHeuristiclyRecentCoordinatesForInstance(&otherInstance.Key) + otherInstancePseudoGtidCoordinates, err := SearchEntryInInstanceBinlogs(otherInstance, instancePseudoGtidText, entriesMonotonic, minBinlogCoordinates) + if err != nil { + return nil, 0, err + } + + // We've found a match: the latest Pseudo GTID position within instance and its identical twin in otherInstance + // We now iterate the events in both, up to the completion of events in instance (recall that we looked for + // the last entry in instance, hence, assuming pseudo GTID entries are frequent, the amount of entries to read + // from instance is not long) + // The result of the iteration will be either: + // - bad conclusion that instance is actually more advanced than otherInstance (we find more entries in instance + // following the pseudo gtid than we can match in otherInstance), hence we cannot ask instance to replicate + // from otherInstance + // - good result: both instances are exactly in same shape (have replicated the exact same number of events since + // the last pseudo gtid). Since they are identical, it is easy to point instance into otherInstance. + // - good result: the first position within otherInstance where instance has not replicated yet. It is easy to point + // instance into otherInstance. + nextBinlogCoordinatesToMatch, countMatchedEvents, err := GetNextBinlogCoordinatesToMatch(instance, *instancePseudoGtidCoordinates, + recordedInstanceRelayLogCoordinates, binlogCoordinates, otherInstance, *otherInstancePseudoGtidCoordinates) + if err != nil { + return nil, 0, err + } + if countMatchedEvents == 0 { + err = fmt.Errorf("Unexpected: 0 events processed while iterating logs. Something went wrong; aborting. 
nextBinlogCoordinatesToMatch: %+v", nextBinlogCoordinatesToMatch)
+		return nil, 0, err
+	}
+	return nextBinlogCoordinatesToMatch, countMatchedEvents, nil
+}
+
+func CorrelateRelaylogCoordinates(instance *Instance, relaylogCoordinates *BinlogCoordinates, otherInstance *Instance) (instanceCoordinates, correlatedCoordinates, nextCoordinates *BinlogCoordinates, found bool, err error) {
+	// The two servers are expected to have the same master, or this doesn't work
+	if !instance.MasterKey.Equals(&otherInstance.MasterKey) {
+		return instanceCoordinates, correlatedCoordinates, nextCoordinates, found, log.Errorf("CorrelateRelaylogCoordinates requires sibling instances, however %+v has master %+v, and %+v has master %+v", instance.Key, instance.MasterKey, otherInstance.Key, otherInstance.MasterKey)
+	}
+	var binlogEvent *BinlogEvent
+	if relaylogCoordinates == nil {
+		instanceCoordinates = &instance.RelaylogCoordinates
+		if minCoordinates, err := GetPreviousKnownRelayLogCoordinatesForInstance(instance); err != nil {
+			return instanceCoordinates, correlatedCoordinates, nextCoordinates, found, err
+		} else if binlogEvent, err = GetLastExecutedEntryInRelayLogs(instance, minCoordinates, instance.RelaylogCoordinates); err != nil {
+			return instanceCoordinates, correlatedCoordinates, nextCoordinates, found, err
+		}
+	} else {
+		instanceCoordinates = relaylogCoordinates
+		relaylogCoordinates.Type = RelayLog
+		if binlogEvent, err = ReadBinlogEventAtRelayLogCoordinates(&instance.Key, relaylogCoordinates); err != nil {
+			return instanceCoordinates, correlatedCoordinates, nextCoordinates, found, err
+		}
+	}
+
+	_, minCoordinates, err := GetHeuristiclyRecentCoordinatesForInstance(&otherInstance.Key)
+	if err != nil {
+		return instanceCoordinates, correlatedCoordinates, nextCoordinates, found, err
+	}
+	correlatedCoordinates, nextCoordinates, found, err = SearchEventInRelayLogs(binlogEvent, otherInstance, minCoordinates, otherInstance.RelaylogCoordinates)
+	return instanceCoordinates, correlatedCoordinates, nextCoordinates, found, err
+}
+
+// MatchBelow will attempt moving the instance indicated by instanceKey below the one indicated by otherKey.
+// The repositioning is based on matching binlog entries, not on "classic" position comparisons.
+// The "other instance" could be a sibling of the moving instance or any of its ancestors. It may actually be
+// a cousin of some sort (though unlikely). The only important thing is that the "other instance" is more
+// advanced in replication than the given instance.
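+// The procedure: stop replication on the instance, correlate its current position with coordinates on the
+// "other instance" via the shared Pseudo-GTID entry, issue CHANGE MASTER TO with those coordinates, and
+// restart replication.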
+func MatchBelow(instanceKey, otherKey *InstanceKey, requireInstanceMaintenance bool) (*Instance, *BinlogCoordinates, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, nil, err + } + // Relocation of group secondaries makes no sense, group secondaries, by definition, always replicate from the group + // primary + if instance.IsReplicationGroupSecondary() { + return instance, nil, fmt.Errorf("MatchBelow: %+v is a secondary replication group member, hence, it cannot be relocated", *instanceKey) + } + if config.Config.PseudoGTIDPattern == "" { + return instance, nil, fmt.Errorf("PseudoGTIDPattern not configured; cannot use Pseudo-GTID") + } + if instanceKey.Equals(otherKey) { + return instance, nil, fmt.Errorf("MatchBelow: attempt to match an instance below itself %+v", *instanceKey) + } + otherInstance, err := ReadTopologyInstance(otherKey) + if err != nil { + return instance, nil, err + } + + rinstance, _, _ := ReadInstance(&instance.Key) + if canMove, merr := rinstance.CanMoveViaMatch(); !canMove { + return instance, nil, merr + } + + if canReplicate, err := instance.CanReplicateFrom(otherInstance); !canReplicate { + return instance, nil, err + } + var nextBinlogCoordinatesToMatch *BinlogCoordinates + var countMatchedEvents int + + if otherInstance.IsBinlogServer() { + // A Binlog Server does not do all the SHOW BINLOG EVENTS stuff + err = fmt.Errorf("Cannot use PseudoGTID with Binlog Server %+v", otherInstance.Key) + goto Cleanup + } + + log.Infof("Will match %+v below %+v", *instanceKey, *otherKey) + + if requireInstanceMaintenance { + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), fmt.Sprintf("match below %+v", *otherKey)); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + // We don't require grabbing maintenance lock on otherInstance, but we do request + // that it is not already under maintenance. + if inMaintenance, merr := InMaintenance(&otherInstance.Key); merr != nil { + err = merr + goto Cleanup + } else if inMaintenance { + err = fmt.Errorf("Cannot match below %+v; it is in maintenance", otherInstance.Key) + goto Cleanup + } + } + + log.Debugf("Stopping replica on %+v", *instanceKey) + instance, err = StopReplication(instanceKey) + if err != nil { + goto Cleanup + } + + nextBinlogCoordinatesToMatch, countMatchedEvents, err = CorrelateBinlogCoordinates(instance, nil, otherInstance) + + if countMatchedEvents == 0 { + err = fmt.Errorf("Unexpected: 0 events processed while iterating logs. Something went wrong; aborting. nextBinlogCoordinatesToMatch: %+v", nextBinlogCoordinatesToMatch) + goto Cleanup + } + log.Debugf("%+v will match below %+v at %+v; validated events: %d", *instanceKey, *otherKey, *nextBinlogCoordinatesToMatch, countMatchedEvents) + + // Drum roll... 
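+	// Point the instance at otherKey using the coordinates computed above; GTIDHintDeny is passed so the
+	// explicit file:pos coordinates are used rather than GTID auto-positioning.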
+ instance, err = ChangeMasterTo(instanceKey, otherKey, nextBinlogCoordinatesToMatch, false, GTIDHintDeny) + if err != nil { + goto Cleanup + } + +Cleanup: + instance, _ = StartReplication(instanceKey) + if err != nil { + return instance, nextBinlogCoordinatesToMatch, log.Errore(err) + } + // and we're done (pending deferred functions) + AuditOperation("match-below", instanceKey, fmt.Sprintf("matched %+v below %+v", *instanceKey, *otherKey)) + + return instance, nextBinlogCoordinatesToMatch, err +} + +// RematchReplica will re-match a replica to its master, using pseudo-gtid +func RematchReplica(instanceKey *InstanceKey, requireInstanceMaintenance bool) (*Instance, *BinlogCoordinates, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, nil, err + } + masterInstance, found, err := ReadInstance(&instance.MasterKey) + if err != nil || !found { + return instance, nil, err + } + return MatchBelow(instanceKey, &masterInstance.Key, requireInstanceMaintenance) +} + +// MakeMaster will take an instance, make all its siblings its replicas (via pseudo-GTID) and make it master +// (stop its replicaiton, make writeable). +func MakeMaster(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + masterInstance, err := ReadTopologyInstance(&instance.MasterKey) + if err == nil { + // If the read succeeded, check the master status. + if masterInstance.IsReplica() { + return instance, fmt.Errorf("MakeMaster: instance's master %+v seems to be replicating", masterInstance.Key) + } + if masterInstance.IsLastCheckValid { + return instance, fmt.Errorf("MakeMaster: instance's master %+v seems to be accessible", masterInstance.Key) + } + } + // Continue anyway if the read failed, because that means the master is + // inaccessible... So it's OK to do the promotion. + if !instance.SQLThreadUpToDate() { + return instance, fmt.Errorf("MakeMaster: instance's SQL thread must be up-to-date with I/O thread for %+v", *instanceKey) + } + siblings, err := ReadReplicaInstances(&masterInstance.Key) + if err != nil { + return instance, err + } + for _, sibling := range siblings { + if instance.ExecBinlogCoordinates.SmallerThan(&sibling.ExecBinlogCoordinates) { + return instance, fmt.Errorf("MakeMaster: instance %+v has more advanced sibling: %+v", *instanceKey, sibling.Key) + } + } + + if maintenanceToken, merr := BeginMaintenance(instanceKey, GetMaintenanceOwner(), fmt.Sprintf("siblings match below this: %+v", *instanceKey)); merr != nil { + err = fmt.Errorf("Cannot begin maintenance on %+v: %v", *instanceKey, merr) + goto Cleanup + } else { + defer EndMaintenance(maintenanceToken) + } + + _, _, err, _ = MultiMatchBelow(siblings, instanceKey, nil) + if err != nil { + goto Cleanup + } + + SetReadOnly(instanceKey, false) + +Cleanup: + if err != nil { + return instance, log.Errore(err) + } + // and we're done (pending deferred functions) + AuditOperation("make-master", instanceKey, fmt.Sprintf("made master of %+v", *instanceKey)) + + return instance, err +} + +// TakeSiblings is a convenience method for turning siblings of a replica to be its subordinates. +// This operation is a syntatctic sugar on top relocate-replicas, which uses any available means to the objective: +// GTID, Pseudo-GTID, binlog servers, standard replication... 
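+// Implementation-wise, it relocates all other replicas of the instance's master below the instance via RelocateReplicas.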
+func TakeSiblings(instanceKey *InstanceKey) (instance *Instance, takenSiblings int, err error) { + instance, err = ReadTopologyInstance(instanceKey) + if err != nil { + return instance, 0, err + } + if !instance.IsReplica() { + return instance, takenSiblings, log.Errorf("take-siblings: instance %+v is not a replica.", *instanceKey) + } + relocatedReplicas, _, err, _ := RelocateReplicas(&instance.MasterKey, instanceKey, "") + + return instance, len(relocatedReplicas), err +} + +// Created this function to allow a hook to be called after a successful TakeMaster event +func TakeMasterHook(successor *Instance, demoted *Instance) { + if demoted == nil { + return + } + if successor == nil { + return + } + successorKey := successor.Key + demotedKey := demoted.Key + env := goos.Environ() + + env = append(env, fmt.Sprintf("ORC_SUCCESSOR_HOST=%s", successorKey)) + env = append(env, fmt.Sprintf("ORC_FAILED_HOST=%s", demotedKey)) + + successorStr := fmt.Sprintf("%s", successorKey) + demotedStr := fmt.Sprintf("%s", demotedKey) + + processCount := len(config.Config.PostTakeMasterProcesses) + for i, command := range config.Config.PostTakeMasterProcesses { + fullDescription := fmt.Sprintf("PostTakeMasterProcesses hook %d of %d", i+1, processCount) + log.Debugf("Take-Master: PostTakeMasterProcesses: Calling %+s", fullDescription) + start := time.Now() + if err := os.CommandRun(command, env, successorStr, demotedStr); err == nil { + info := fmt.Sprintf("Completed %s in %v", fullDescription, time.Since(start)) + log.Infof("Take-Master: %s", info) + } else { + info := fmt.Sprintf("Execution of PostTakeMasterProcesses failed in %v with error: %v", time.Since(start), err) + log.Errorf("Take-Master: %s", info) + } + } + +} + +// TakeMaster will move an instance up the chain and cause its master to become its replica. +// It's almost a role change, just that other replicas of either 'instance' or its master are currently unaffected +// (they continue replicate without change) +// Note that the master must itself be a replica; however the grandparent does not necessarily have to be reachable +// and can in fact be dead. +func TakeMaster(instanceKey *InstanceKey, allowTakingCoMaster bool) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + // Relocation of group secondaries makes no sense, group secondaries, by definition, always replicate from the group + // primary + if instance.IsReplicationGroupSecondary() { + return instance, fmt.Errorf("takeMaster: %+v is a secondary replication group member, hence, it cannot be relocated", instance.Key) + } + masterInstance, found, err := ReadInstance(&instance.MasterKey) + if err != nil || !found { + return instance, err + } + if masterInstance.IsCoMaster && !allowTakingCoMaster { + return instance, fmt.Errorf("%+v is co-master. 
Cannot take it.", masterInstance.Key) + } + log.Debugf("TakeMaster: will attempt making %+v take its master %+v, now resolved as %+v", *instanceKey, instance.MasterKey, masterInstance.Key) + + if canReplicate, err := masterInstance.CanReplicateFrom(instance); canReplicate == false { + return instance, err + } + // We begin + masterInstance, err = StopReplication(&masterInstance.Key) + if err != nil { + goto Cleanup + } + instance, err = StopReplication(&instance.Key) + if err != nil { + goto Cleanup + } + + instance, err = StartReplicationUntilMasterCoordinates(&instance.Key, &masterInstance.SelfBinlogCoordinates) + if err != nil { + goto Cleanup + } + + // instance and masterInstance are equal + // We skip name unresolve. It is OK if the master's master is dead, unreachable, does not resolve properly. + // We just copy+paste info from the master. + // In particular, this is commonly calledin DeadMaster recovery + instance, err = ChangeMasterTo(&instance.Key, &masterInstance.MasterKey, &masterInstance.ExecBinlogCoordinates, true, GTIDHintNeutral) + if err != nil { + goto Cleanup + } + // instance is now sibling of master + masterInstance, err = ChangeMasterTo(&masterInstance.Key, &instance.Key, &instance.SelfBinlogCoordinates, false, GTIDHintNeutral) + if err != nil { + goto Cleanup + } + // swap is done! + +Cleanup: + if instance != nil { + instance, _ = StartReplication(&instance.Key) + } + if masterInstance != nil { + masterInstance, _ = StartReplication(&masterInstance.Key) + } + if err != nil { + return instance, err + } + AuditOperation("take-master", instanceKey, fmt.Sprintf("took master: %+v", masterInstance.Key)) + + // Created this to enable a custom hook to be called after a TakeMaster success. + // This only runs if there is a hook configured in orchestrator.conf.json + demoted := masterInstance + successor := instance + if config.Config.PostTakeMasterProcesses != nil { + TakeMasterHook(successor, demoted) + } + + return instance, err +} + +// MakeLocalMaster promotes a replica above its master, making it replica of its grandparent, while also enslaving its siblings. +// This serves as a convenience method to recover replication when a local master fails; the instance promoted is one of its replicas, +// which is most advanced among its siblings. 
+// This method utilizes Pseudo GTID +func MakeLocalMaster(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + masterInstance, found, err := ReadInstance(&instance.MasterKey) + if err != nil || !found { + return instance, err + } + grandparentInstance, err := ReadTopologyInstance(&masterInstance.MasterKey) + if err != nil { + return instance, err + } + siblings, err := ReadReplicaInstances(&masterInstance.Key) + if err != nil { + return instance, err + } + for _, sibling := range siblings { + if instance.ExecBinlogCoordinates.SmallerThan(&sibling.ExecBinlogCoordinates) { + return instance, fmt.Errorf("MakeMaster: instance %+v has more advanced sibling: %+v", *instanceKey, sibling.Key) + } + } + + instance, err = StopReplicationNicely(instanceKey, 0) + if err != nil { + goto Cleanup + } + + _, _, err = MatchBelow(instanceKey, &grandparentInstance.Key, true) + if err != nil { + goto Cleanup + } + + _, _, err, _ = MultiMatchBelow(siblings, instanceKey, nil) + if err != nil { + goto Cleanup + } + +Cleanup: + if err != nil { + return instance, log.Errore(err) + } + // and we're done (pending deferred functions) + AuditOperation("make-local-master", instanceKey, fmt.Sprintf("made master of %+v", *instanceKey)) + + return instance, err +} + +// sortInstances shuffles given list of instances according to some logic +func sortInstancesDataCenterHint(instances [](*Instance), dataCenterHint string) { + sort.Sort(sort.Reverse(NewInstancesSorterByExec(instances, dataCenterHint))) +} + +// sortInstances shuffles given list of instances according to some logic +func sortInstances(instances [](*Instance)) { + sortInstancesDataCenterHint(instances, "") +} + +// getReplicasForSorting returns a list of replicas of a given master potentially for candidate choosing +func getReplicasForSorting(masterKey *InstanceKey, includeBinlogServerSubReplicas bool) (replicas [](*Instance), err error) { + if includeBinlogServerSubReplicas { + replicas, err = ReadReplicaInstancesIncludingBinlogServerSubReplicas(masterKey) + } else { + replicas, err = ReadReplicaInstances(masterKey) + } + return replicas, err +} + +func sortedReplicas(replicas [](*Instance), stopReplicationMethod StopReplicationMethod) [](*Instance) { + return sortedReplicasDataCenterHint(replicas, stopReplicationMethod, "") +} + +// sortedReplicas returns the list of replicas of some master, sorted by exec coordinates +// (most up-to-date replica first). +// This function assumes given `replicas` argument is indeed a list of instances all replicating +// from the same master (the result of `getReplicasForSorting()` is appropriate) +func sortedReplicasDataCenterHint(replicas [](*Instance), stopReplicationMethod StopReplicationMethod, dataCenterHint string) [](*Instance) { + if len(replicas) <= 1 { + return replicas + } + replicas = StopReplicas(replicas, stopReplicationMethod, time.Duration(config.Config.InstanceBulkOperationsWaitTimeoutSeconds)*time.Second) + replicas = RemoveNilInstances(replicas) + + sortInstancesDataCenterHint(replicas, dataCenterHint) + for _, replica := range replicas { + log.Debugf("- sorted replica: %+v %+v", replica.Key, replica.ExecBinlogCoordinates) + } + + return replicas +} + +// GetSortedReplicas reads list of replicas of a given master, and returns them sorted by exec coordinates +// (most up-to-date replica first). 
+func GetSortedReplicas(masterKey *InstanceKey, stopReplicationMethod StopReplicationMethod) (replicas [](*Instance), err error) { + if replicas, err = getReplicasForSorting(masterKey, false); err != nil { + return replicas, err + } + replicas = sortedReplicas(replicas, stopReplicationMethod) + if len(replicas) == 0 { + return replicas, fmt.Errorf("No replicas found for %+v", *masterKey) + } + return replicas, err +} + +// MultiMatchBelow will efficiently match multiple replicas below a given instance. +// It is assumed that all given replicas are siblings +func MultiMatchBelow(replicas [](*Instance), belowKey *InstanceKey, postponedFunctionsContainer *PostponedFunctionsContainer) (matchedReplicas [](*Instance), belowInstance *Instance, err error, errs []error) { + belowInstance, found, err := ReadInstance(belowKey) + if err != nil || !found { + return matchedReplicas, belowInstance, err, errs + } + + replicas = RemoveInstance(replicas, belowKey) + if len(replicas) == 0 { + // Nothing to do + return replicas, belowInstance, err, errs + } + + log.Infof("Will match %+v replicas below %+v via Pseudo-GTID, independently", len(replicas), belowKey) + + barrier := make(chan *InstanceKey) + replicaMutex := &sync.Mutex{} + + for _, replica := range replicas { + replica := replica + + // Parallelize repoints + go func() { + defer func() { barrier <- &replica.Key }() + matchFunc := func() error { + replica, _, replicaErr := MatchBelow(&replica.Key, belowKey, true) + + replicaMutex.Lock() + defer replicaMutex.Unlock() + + if replicaErr == nil { + matchedReplicas = append(matchedReplicas, replica) + } else { + errs = append(errs, replicaErr) + } + return replicaErr + } + if shouldPostponeRelocatingReplica(replica, postponedFunctionsContainer) { + postponedFunctionsContainer.AddPostponedFunction(matchFunc, fmt.Sprintf("multi-match-below-independent %+v", replica.Key)) + // We bail out and trust our invoker to later call upon this postponed function + } else { + ExecuteOnTopology(func() { matchFunc() }) + } + }() + } + for range replicas { + <-barrier + } + if len(errs) == len(replicas) { + // All returned with error + return matchedReplicas, belowInstance, fmt.Errorf("MultiMatchBelowIndependently: Error on all %+v operations", len(errs)), errs + } + AuditOperation("multi-match-below-independent", belowKey, fmt.Sprintf("matched %d/%d replicas below %+v via Pseudo-GTID", len(matchedReplicas), len(replicas), belowKey)) + + return matchedReplicas, belowInstance, err, errs +} + +// MultiMatchReplicas will match (via pseudo-gtid) all replicas of given master below given instance. 
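+// Binlog server topologies are special-cased: when the master and/or the target is a binlog server in the same
+// family, replicas are simply repointed (RepointReplicasTo) rather than matched via Pseudo-GTID.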
+func MultiMatchReplicas(masterKey *InstanceKey, belowKey *InstanceKey, pattern string) ([](*Instance), *Instance, error, []error) { + res := [](*Instance){} + errs := []error{} + + belowInstance, err := ReadTopologyInstance(belowKey) + if err != nil { + // Can't access "below" ==> can't match replicas beneath it + return res, nil, err, errs + } + + masterInstance, found, err := ReadInstance(masterKey) + if err != nil || !found { + return res, nil, err, errs + } + + // See if we have a binlog server case (special handling): + binlogCase := false + if masterInstance.IsBinlogServer() && masterInstance.MasterKey.Equals(belowKey) { + // repoint-up + log.Debugf("MultiMatchReplicas: pointing replicas up from binlog server") + binlogCase = true + } else if belowInstance.IsBinlogServer() && belowInstance.MasterKey.Equals(masterKey) { + // repoint-down + log.Debugf("MultiMatchReplicas: pointing replicas down to binlog server") + binlogCase = true + } else if masterInstance.IsBinlogServer() && belowInstance.IsBinlogServer() && masterInstance.MasterKey.Equals(&belowInstance.MasterKey) { + // Both BLS, siblings + log.Debugf("MultiMatchReplicas: pointing replicas to binlong sibling") + binlogCase = true + } + if binlogCase { + replicas, err, errors := RepointReplicasTo(masterKey, pattern, belowKey) + // Bail out! + return replicas, masterInstance, err, errors + } + + // Not binlog server + + // replicas involved + replicas, err := ReadReplicaInstancesIncludingBinlogServerSubReplicas(masterKey) + if err != nil { + return res, belowInstance, err, errs + } + replicas = filterInstancesByPattern(replicas, pattern) + matchedReplicas, belowInstance, err, errs := MultiMatchBelow(replicas, &belowInstance.Key, nil) + + if len(matchedReplicas) != len(replicas) { + err = fmt.Errorf("MultiMatchReplicas: only matched %d out of %d replicas of %+v; error is: %+v", len(matchedReplicas), len(replicas), *masterKey, err) + } + AuditOperation("multi-match-replicas", masterKey, fmt.Sprintf("matched %d replicas under %+v", len(matchedReplicas), *belowKey)) + + return matchedReplicas, belowInstance, err, errs +} + +// MatchUp will move a replica up the replication chain, so that it becomes sibling of its master, via Pseudo-GTID +func MatchUp(instanceKey *InstanceKey, requireInstanceMaintenance bool) (*Instance, *BinlogCoordinates, error) { + instance, found, err := ReadInstance(instanceKey) + if err != nil || !found { + return nil, nil, err + } + if !instance.IsReplica() { + return instance, nil, fmt.Errorf("instance is not a replica: %+v", instanceKey) + } + // Relocation of group secondaries makes no sense, group secondaries, by definition, always replicate from the group + // primary + if instance.IsReplicationGroupSecondary() { + return instance, nil, fmt.Errorf("MatchUp: %+v is a secondary replication group member, hence, it cannot be relocated", instance.Key) + } + master, found, err := ReadInstance(&instance.MasterKey) + if err != nil || !found { + return instance, nil, log.Errorf("Cannot get master for %+v. error=%+v", instance.Key, err) + } + + if !master.IsReplica() { + return instance, nil, fmt.Errorf("master is not a replica itself: %+v", master.Key) + } + + return MatchBelow(instanceKey, &master.MasterKey, requireInstanceMaintenance) +} + +// MatchUpReplicas will move all replicas of given master up the replication chain, +// so that they become siblings of their master. 
+// This should be called when the local master dies, and all its replicas are to be resurrected via Pseudo-GTID +func MatchUpReplicas(masterKey *InstanceKey, pattern string) ([](*Instance), *Instance, error, []error) { + res := [](*Instance){} + errs := []error{} + + masterInstance, found, err := ReadInstance(masterKey) + if err != nil || !found { + return res, nil, err, errs + } + + return MultiMatchReplicas(masterKey, &masterInstance.MasterKey, pattern) +} + +func isGenerallyValidAsBinlogSource(replica *Instance) bool { + if !replica.IsLastCheckValid { + // something wrong with this replica right now. We shouldn't hope to be able to promote it + return false + } + if !replica.LogBinEnabled { + return false + } + if !replica.LogReplicationUpdatesEnabled { + return false + } + + return true +} + +func isGenerallyValidAsCandidateReplica(replica *Instance) bool { + if !isGenerallyValidAsBinlogSource(replica) { + // does not have binary logs + return false + } + if replica.IsBinlogServer() { + // Can't regroup under a binlog server because it does not support pseudo-gtid related queries such as SHOW BINLOG EVENTS + return false + } + + return true +} + +// isValidAsCandidateMasterInBinlogServerTopology let's us know whether a given replica is generally +// valid to promote to be master. +func isValidAsCandidateMasterInBinlogServerTopology(replica *Instance) bool { + if !replica.IsLastCheckValid { + // something wrong with this replica right now. We shouldn't hope to be able to promote it + return false + } + if !replica.LogBinEnabled { + return false + } + if replica.LogReplicationUpdatesEnabled { + // That's right: we *disallow* log-replica-updates + return false + } + if replica.IsBinlogServer() { + return false + } + + return true +} + +func IsBannedFromBeingCandidateReplica(replica *Instance) bool { + if replica.PromotionRule == MustNotPromoteRule { + log.Debugf("instance %+v is banned because of promotion rule", replica.Key) + return true + } + for _, filter := range config.Config.PromotionIgnoreHostnameFilters { + if matched, _ := regexp.MatchString(filter, replica.Key.Hostname); matched { + return true + } + } + return false +} + +// getPriorityMajorVersionForCandidate returns the primary (most common) major version found +// among given instances. This will be used for choosing best candidate for promotion. +func getPriorityMajorVersionForCandidate(replicas [](*Instance)) (priorityMajorVersion string, err error) { + if len(replicas) == 0 { + return "", log.Errorf("empty replicas list in getPriorityMajorVersionForCandidate") + } + majorVersionsCount := make(map[string]int) + for _, replica := range replicas { + majorVersionsCount[replica.MajorVersionString()] = majorVersionsCount[replica.MajorVersionString()] + 1 + } + if len(majorVersionsCount) == 1 { + // all same version, simple case + return replicas[0].MajorVersionString(), nil + } + sorted := NewMajorVersionsSortedByCount(majorVersionsCount) + sort.Sort(sort.Reverse(sorted)) + return sorted.First(), nil +} + +// getPriorityBinlogFormatForCandidate returns the primary (most common) binlog format found +// among given instances. This will be used for choosing best candidate for promotion. 
+func getPriorityBinlogFormatForCandidate(replicas [](*Instance)) (priorityBinlogFormat string, err error) { + if len(replicas) == 0 { + return "", log.Errorf("empty replicas list in getPriorityBinlogFormatForCandidate") + } + binlogFormatsCount := make(map[string]int) + for _, replica := range replicas { + binlogFormatsCount[replica.Binlog_format] = binlogFormatsCount[replica.Binlog_format] + 1 + } + if len(binlogFormatsCount) == 1 { + // all same binlog format, simple case + return replicas[0].Binlog_format, nil + } + sorted := NewBinlogFormatSortedByCount(binlogFormatsCount) + sort.Sort(sort.Reverse(sorted)) + return sorted.First(), nil +} + +// chooseCandidateReplica +func chooseCandidateReplica(replicas [](*Instance)) (candidateReplica *Instance, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas [](*Instance), err error) { + if len(replicas) == 0 { + return candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, fmt.Errorf("No replicas found given in chooseCandidateReplica") + } + priorityMajorVersion, _ := getPriorityMajorVersionForCandidate(replicas) + priorityBinlogFormat, _ := getPriorityBinlogFormatForCandidate(replicas) + + for _, replica := range replicas { + replica := replica + if isGenerallyValidAsCandidateReplica(replica) && + !IsBannedFromBeingCandidateReplica(replica) && + !IsSmallerMajorVersion(priorityMajorVersion, replica.MajorVersionString()) && + !IsSmallerBinlogFormat(priorityBinlogFormat, replica.Binlog_format) { + // this is the one + candidateReplica = replica + break + } + } + if candidateReplica == nil { + // Unable to find a candidate that will master others. + // Instead, pick a (single) replica which is not banned. + for _, replica := range replicas { + replica := replica + if !IsBannedFromBeingCandidateReplica(replica) { + // this is the one + candidateReplica = replica + break + } + } + if candidateReplica != nil { + replicas = RemoveInstance(replicas, &candidateReplica.Key) + } + return candidateReplica, replicas, equalReplicas, laterReplicas, cannotReplicateReplicas, fmt.Errorf("chooseCandidateReplica: no candidate replica found") + } + replicas = RemoveInstance(replicas, &candidateReplica.Key) + for _, replica := range replicas { + replica := replica + if canReplicate, err := replica.CanReplicateFrom(candidateReplica); !canReplicate { + // lost due to inability to replicate + cannotReplicateReplicas = append(cannotReplicateReplicas, replica) + if err != nil { + log.Errorf("chooseCandidateReplica(): error checking CanReplicateFrom(). replica: %v; error: %v", replica.Key, err) + } + } else if replica.ExecBinlogCoordinates.SmallerThan(&candidateReplica.ExecBinlogCoordinates) { + laterReplicas = append(laterReplicas, replica) + } else if replica.ExecBinlogCoordinates.Equals(&candidateReplica.ExecBinlogCoordinates) { + equalReplicas = append(equalReplicas, replica) + } else { + // lost due to being more advanced/ahead of chosen replica. 
+ aheadReplicas = append(aheadReplicas, replica) + } + } + return candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err +} + +// GetCandidateReplica chooses the best replica to promote given a (possibly dead) master +func GetCandidateReplica(masterKey *InstanceKey, forRematchPurposes bool) (*Instance, [](*Instance), [](*Instance), [](*Instance), [](*Instance), error) { + var candidateReplica *Instance + aheadReplicas := [](*Instance){} + equalReplicas := [](*Instance){} + laterReplicas := [](*Instance){} + cannotReplicateReplicas := [](*Instance){} + + dataCenterHint := "" + if master, _, _ := ReadInstance(masterKey); master != nil { + dataCenterHint = master.DataCenter + } + replicas, err := getReplicasForSorting(masterKey, false) + if err != nil { + return candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err + } + stopReplicationMethod := NoStopReplication + if forRematchPurposes { + stopReplicationMethod = StopReplicationNice + } + replicas = sortedReplicasDataCenterHint(replicas, stopReplicationMethod, dataCenterHint) + if err != nil { + return candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err + } + if len(replicas) == 0 { + return candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, fmt.Errorf("No replicas found for %+v", *masterKey) + } + candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err = chooseCandidateReplica(replicas) + if err != nil { + return candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err + } + if candidateReplica != nil { + mostUpToDateReplica := replicas[0] + if candidateReplica.ExecBinlogCoordinates.SmallerThan(&mostUpToDateReplica.ExecBinlogCoordinates) { + log.Warningf("GetCandidateReplica: chosen replica: %+v is behind most-up-to-date replica: %+v", candidateReplica.Key, mostUpToDateReplica.Key) + } + } + log.Debugf("GetCandidateReplica: candidate: %+v, ahead: %d, equal: %d, late: %d, break: %d", candidateReplica.Key, len(aheadReplicas), len(equalReplicas), len(laterReplicas), len(cannotReplicateReplicas)) + return candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, nil +} + +// GetCandidateReplicaOfBinlogServerTopology chooses the best replica to promote given a (possibly dead) master +func GetCandidateReplicaOfBinlogServerTopology(masterKey *InstanceKey) (candidateReplica *Instance, err error) { + replicas, err := getReplicasForSorting(masterKey, true) + if err != nil { + return candidateReplica, err + } + replicas = sortedReplicas(replicas, NoStopReplication) + if len(replicas) == 0 { + return candidateReplica, fmt.Errorf("No replicas found for %+v", *masterKey) + } + for _, replica := range replicas { + replica := replica + if candidateReplica != nil { + break + } + if isValidAsCandidateMasterInBinlogServerTopology(replica) && !IsBannedFromBeingCandidateReplica(replica) { + // this is the one + candidateReplica = replica + } + } + if candidateReplica != nil { + log.Debugf("GetCandidateReplicaOfBinlogServerTopology: returning %+v as candidate replica for %+v", candidateReplica.Key, *masterKey) + } else { + log.Debugf("GetCandidateReplicaOfBinlogServerTopology: no candidate replica found for %+v", *masterKey) + } + return candidateReplica, err +} + +// RegroupReplicasPseudoGTID will choose a candidate replica of a given instance, and take its siblings using pseudo-gtid +func 
RegroupReplicasPseudoGTID( + masterKey *InstanceKey, + returnReplicaEvenOnFailureToRegroup bool, + onCandidateReplicaChosen func(*Instance), + postponedFunctionsContainer *PostponedFunctionsContainer, + postponeAllMatchOperations func(*Instance, bool) bool, +) ( + aheadReplicas [](*Instance), + equalReplicas [](*Instance), + laterReplicas [](*Instance), + cannotReplicateReplicas [](*Instance), + candidateReplica *Instance, + err error, +) { + candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err = GetCandidateReplica(masterKey, true) + if err != nil { + if !returnReplicaEvenOnFailureToRegroup { + candidateReplica = nil + } + return aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, candidateReplica, err + } + + if config.Config.PseudoGTIDPattern == "" { + return aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, candidateReplica, fmt.Errorf("PseudoGTIDPattern not configured; cannot use Pseudo-GTID") + } + + if onCandidateReplicaChosen != nil { + onCandidateReplicaChosen(candidateReplica) + } + + allMatchingFunc := func() error { + log.Debugf("RegroupReplicas: working on %d equals replicas", len(equalReplicas)) + barrier := make(chan *InstanceKey) + for _, replica := range equalReplicas { + replica := replica + // This replica has the exact same executing coordinates as the candidate replica. This replica + // is *extremely* easy to attach below the candidate replica! + go func() { + defer func() { barrier <- &candidateReplica.Key }() + ExecuteOnTopology(func() { + ChangeMasterTo(&replica.Key, &candidateReplica.Key, &candidateReplica.SelfBinlogCoordinates, false, GTIDHintDeny) + }) + }() + } + for range equalReplicas { + <-barrier + } + + log.Debugf("RegroupReplicas: multi matching %d later replicas", len(laterReplicas)) + // As for the laterReplicas, we'll have to apply pseudo GTID + laterReplicas, candidateReplica, err, _ = MultiMatchBelow(laterReplicas, &candidateReplica.Key, postponedFunctionsContainer) + + operatedReplicas := append(equalReplicas, candidateReplica) + operatedReplicas = append(operatedReplicas, laterReplicas...) 
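+		// Restart replication on the candidate and on every replica we operated on, in parallel;
+		// the barrier channel waits for all goroutines to complete before auditing.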
+		log.Debugf("RegroupReplicas: starting %d replicas", len(operatedReplicas))
+		barrier = make(chan *InstanceKey)
+		for _, replica := range operatedReplicas {
+			replica := replica
+			go func() {
+				defer func() { barrier <- &candidateReplica.Key }()
+				ExecuteOnTopology(func() {
+					StartReplication(&replica.Key)
+				})
+			}()
+		}
+		for range operatedReplicas {
+			<-barrier
+		}
+		AuditOperation("regroup-replicas", masterKey, fmt.Sprintf("regrouped %+v replicas below %+v", len(operatedReplicas), *masterKey))
+		return err
+	}
+	if postponedFunctionsContainer != nil && postponeAllMatchOperations != nil && postponeAllMatchOperations(candidateReplica, false) {
+		postponedFunctionsContainer.AddPostponedFunction(allMatchingFunc, fmt.Sprintf("regroup-replicas-pseudo-gtid %+v", candidateReplica.Key))
+	} else {
+		err = allMatchingFunc()
+	}
+	log.Debugf("RegroupReplicas: done")
+	// aheadReplicas are lost (they were ahead in replication as compared to promoted replica)
+	return aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, candidateReplica, err
+}
+
+func getMostUpToDateActiveBinlogServer(masterKey *InstanceKey) (mostAdvancedBinlogServer *Instance, binlogServerReplicas [](*Instance), err error) {
+	if binlogServerReplicas, err = ReadBinlogServerReplicaInstances(masterKey); err == nil && len(binlogServerReplicas) > 0 {
+		// Pick the most advanced binlog server that is good to go
+		for _, binlogServer := range binlogServerReplicas {
+			if binlogServer.IsLastCheckValid {
+				if mostAdvancedBinlogServer == nil {
+					mostAdvancedBinlogServer = binlogServer
+				}
+				if mostAdvancedBinlogServer.ExecBinlogCoordinates.SmallerThan(&binlogServer.ExecBinlogCoordinates) {
+					mostAdvancedBinlogServer = binlogServer
+				}
+			}
+		}
+	}
+	return mostAdvancedBinlogServer, binlogServerReplicas, err
+}
+
+// RegroupReplicasPseudoGTIDIncludingSubReplicasOfBinlogServers uses Pseudo-GTID to regroup replicas
+// of a given instance. The function also drills into replicas of binlog servers that are replicating from the given instance,
+// and other recursive binlog servers, as long as they're in the same binlog-server-family.
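+// The flow: pick the most up-to-date active binlog server; if the candidate replica lags behind it, temporarily
+// repoint the candidate under that binlog server to catch up, then repoint it back under the master; match the
+// binlog servers' replicas below the candidate; finally run the regular Pseudo-GTID regroup.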
+func RegroupReplicasPseudoGTIDIncludingSubReplicasOfBinlogServers( + masterKey *InstanceKey, + returnReplicaEvenOnFailureToRegroup bool, + onCandidateReplicaChosen func(*Instance), + postponedFunctionsContainer *PostponedFunctionsContainer, + postponeAllMatchOperations func(*Instance, bool) bool, +) ( + aheadReplicas [](*Instance), + equalReplicas [](*Instance), + laterReplicas [](*Instance), + cannotReplicateReplicas [](*Instance), + candidateReplica *Instance, + err error, +) { + // First, handle binlog server issues: + func() error { + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: starting on replicas of %+v", *masterKey) + // Find the most up to date binlog server: + mostUpToDateBinlogServer, binlogServerReplicas, err := getMostUpToDateActiveBinlogServer(masterKey) + if err != nil { + return log.Errore(err) + } + if mostUpToDateBinlogServer == nil { + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: no binlog server replicates from %+v", *masterKey) + // No binlog server; proceed as normal + return nil + } + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: most up to date binlog server of %+v: %+v", *masterKey, mostUpToDateBinlogServer.Key) + + // Find the most up to date candidate replica: + candidateReplica, _, _, _, _, err := GetCandidateReplica(masterKey, true) + if err != nil { + return log.Errore(err) + } + if candidateReplica == nil { + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: no candidate replica for %+v", *masterKey) + // Let the followup code handle that + return nil + } + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: candidate replica of %+v: %+v", *masterKey, candidateReplica.Key) + + if candidateReplica.ExecBinlogCoordinates.SmallerThan(&mostUpToDateBinlogServer.ExecBinlogCoordinates) { + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: candidate replica %+v coordinates smaller than binlog server %+v", candidateReplica.Key, mostUpToDateBinlogServer.Key) + // Need to align under binlog server... + candidateReplica, err = Repoint(&candidateReplica.Key, &mostUpToDateBinlogServer.Key, GTIDHintDeny) + if err != nil { + return log.Errore(err) + } + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: repointed candidate replica %+v under binlog server %+v", candidateReplica.Key, mostUpToDateBinlogServer.Key) + candidateReplica, err = StartReplicationUntilMasterCoordinates(&candidateReplica.Key, &mostUpToDateBinlogServer.ExecBinlogCoordinates) + if err != nil { + return log.Errore(err) + } + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: aligned candidate replica %+v under binlog server %+v", candidateReplica.Key, mostUpToDateBinlogServer.Key) + // and move back + candidateReplica, err = Repoint(&candidateReplica.Key, masterKey, GTIDHintDeny) + if err != nil { + return log.Errore(err) + } + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: repointed candidate replica %+v under master %+v", candidateReplica.Key, *masterKey) + return nil + } + // Either because it _was_ like that, or we _made_ it so, + // candidate replica is as/more up to date than all binlog servers + for _, binlogServer := range binlogServerReplicas { + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: matching replicas of binlog server %+v below %+v", binlogServer.Key, candidateReplica.Key) + // Right now sequentially. 
+ // At this point just do what you can, don't return an error + MultiMatchReplicas(&binlogServer.Key, &candidateReplica.Key, "") + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: done matching replicas of binlog server %+v below %+v", binlogServer.Key, candidateReplica.Key) + } + log.Debugf("RegroupReplicasIncludingSubReplicasOfBinlogServers: done handling binlog regrouping for %+v; will proceed with normal RegroupReplicas", *masterKey) + AuditOperation("regroup-replicas-including-bls", masterKey, fmt.Sprintf("matched replicas of binlog server replicas of %+v under %+v", *masterKey, candidateReplica.Key)) + return nil + }() + // Proceed to normal regroup: + return RegroupReplicasPseudoGTID(masterKey, returnReplicaEvenOnFailureToRegroup, onCandidateReplicaChosen, postponedFunctionsContainer, postponeAllMatchOperations) +} + +// RegroupReplicasGTID will choose a candidate replica of a given instance, and take its siblings using GTID +func RegroupReplicasGTID( + masterKey *InstanceKey, + returnReplicaEvenOnFailureToRegroup bool, + onCandidateReplicaChosen func(*Instance), + postponedFunctionsContainer *PostponedFunctionsContainer, + postponeAllMatchOperations func(*Instance, bool) bool, +) ( + lostReplicas [](*Instance), + movedReplicas [](*Instance), + cannotReplicateReplicas [](*Instance), + candidateReplica *Instance, + err error, +) { + var emptyReplicas [](*Instance) + var unmovedReplicas [](*Instance) + candidateReplica, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := GetCandidateReplica(masterKey, true) + if err != nil { + if !returnReplicaEvenOnFailureToRegroup { + candidateReplica = nil + } + return emptyReplicas, emptyReplicas, emptyReplicas, candidateReplica, err + } + + if onCandidateReplicaChosen != nil { + onCandidateReplicaChosen(candidateReplica) + } + replicasToMove := append(equalReplicas, laterReplicas...) + hasBestPromotionRule := true + if candidateReplica != nil { + for _, replica := range replicasToMove { + if replica.PromotionRule.BetterThan(candidateReplica.PromotionRule) { + hasBestPromotionRule = false + } + } + } + moveGTIDFunc := func() error { + log.Debugf("RegroupReplicasGTID: working on %d replicas", len(replicasToMove)) + + movedReplicas, unmovedReplicas, err, _ = moveReplicasViaGTID(replicasToMove, candidateReplica, postponedFunctionsContainer) + unmovedReplicas = append(unmovedReplicas, aheadReplicas...) + return log.Errore(err) + } + if postponedFunctionsContainer != nil && postponeAllMatchOperations != nil && postponeAllMatchOperations(candidateReplica, hasBestPromotionRule) { + postponedFunctionsContainer.AddPostponedFunction(moveGTIDFunc, fmt.Sprintf("regroup-replicas-gtid %+v", candidateReplica.Key)) + } else { + err = moveGTIDFunc() + } + + StartReplication(&candidateReplica.Key) + + log.Debugf("RegroupReplicasGTID: done") + AuditOperation("regroup-replicas-gtid", masterKey, fmt.Sprintf("regrouped replicas of %+v via GTID; promoted %+v", *masterKey, candidateReplica.Key)) + return unmovedReplicas, movedReplicas, cannotReplicateReplicas, candidateReplica, err +} + +// RegroupReplicasBinlogServers works on a binlog-servers topology. 
It picks the most up-to-date BLS and repoints all other +// BLS below it +func RegroupReplicasBinlogServers(masterKey *InstanceKey, returnReplicaEvenOnFailureToRegroup bool) (repointedBinlogServers [](*Instance), promotedBinlogServer *Instance, err error) { + var binlogServerReplicas [](*Instance) + promotedBinlogServer, binlogServerReplicas, err = getMostUpToDateActiveBinlogServer(masterKey) + + resultOnError := func(err error) ([](*Instance), *Instance, error) { + if !returnReplicaEvenOnFailureToRegroup { + promotedBinlogServer = nil + } + return repointedBinlogServers, promotedBinlogServer, err + } + + if err != nil { + return resultOnError(err) + } + + repointedBinlogServers, err, _ = RepointTo(binlogServerReplicas, &promotedBinlogServer.Key) + + if err != nil { + return resultOnError(err) + } + AuditOperation("regroup-replicas-bls", masterKey, fmt.Sprintf("regrouped binlog server replicas of %+v; promoted %+v", *masterKey, promotedBinlogServer.Key)) + return repointedBinlogServers, promotedBinlogServer, nil +} + +// RegroupReplicas is a "smart" method of promoting one replica over the others ("promoting" it on top of its siblings) +// This method decides which strategy to use: GTID, Pseudo-GTID, Binlog Servers. +func RegroupReplicas(masterKey *InstanceKey, returnReplicaEvenOnFailureToRegroup bool, + onCandidateReplicaChosen func(*Instance), + postponedFunctionsContainer *PostponedFunctionsContainer) ( + + aheadReplicas [](*Instance), + equalReplicas [](*Instance), + laterReplicas [](*Instance), + cannotReplicateReplicas [](*Instance), + instance *Instance, + err error, +) { + // + var emptyReplicas [](*Instance) + + replicas, err := ReadReplicaInstances(masterKey) + if err != nil { + return emptyReplicas, emptyReplicas, emptyReplicas, emptyReplicas, instance, err + } + if len(replicas) == 0 { + return emptyReplicas, emptyReplicas, emptyReplicas, emptyReplicas, instance, err + } + if len(replicas) == 1 { + return emptyReplicas, emptyReplicas, emptyReplicas, emptyReplicas, replicas[0], err + } + allGTID := true + allBinlogServers := true + allPseudoGTID := true + for _, replica := range replicas { + if !replica.UsingGTID() { + allGTID = false + } + if !replica.IsBinlogServer() { + allBinlogServers = false + } + if !replica.UsingPseudoGTID { + allPseudoGTID = false + } + } + if allGTID { + log.Debugf("RegroupReplicas: using GTID to regroup replicas of %+v", *masterKey) + unmovedReplicas, movedReplicas, cannotReplicateReplicas, candidateReplica, err := RegroupReplicasGTID(masterKey, returnReplicaEvenOnFailureToRegroup, onCandidateReplicaChosen, nil, nil) + return unmovedReplicas, emptyReplicas, movedReplicas, cannotReplicateReplicas, candidateReplica, err + } + if allBinlogServers { + log.Debugf("RegroupReplicas: using binlog servers to regroup replicas of %+v", *masterKey) + movedReplicas, candidateReplica, err := RegroupReplicasBinlogServers(masterKey, returnReplicaEvenOnFailureToRegroup) + return emptyReplicas, emptyReplicas, movedReplicas, cannotReplicateReplicas, candidateReplica, err + } + if allPseudoGTID { + log.Debugf("RegroupReplicas: using Pseudo-GTID to regroup replicas of %+v", *masterKey) + return RegroupReplicasPseudoGTID(masterKey, returnReplicaEvenOnFailureToRegroup, onCandidateReplicaChosen, postponedFunctionsContainer, nil) + } + // And, as last resort, we do PseudoGTID & binlog servers + log.Warningf("RegroupReplicas: unsure what method to invoke for %+v; trying Pseudo-GTID+Binlog Servers", *masterKey) + return 
RegroupReplicasPseudoGTIDIncludingSubReplicasOfBinlogServers(masterKey, returnReplicaEvenOnFailureToRegroup, onCandidateReplicaChosen, postponedFunctionsContainer, nil)
+}
+
+// relocateBelowInternal is a potentially recursive function which chooses how to relocate an instance below another.
+// It may choose to use Pseudo-GTID, or normal binlog positions, or take advantage of binlog servers,
+// or it may combine any of the above in a multi-step operation.
+func relocateBelowInternal(instance, other *Instance) (*Instance, error) {
+ if canReplicate, err := instance.CanReplicateFrom(other); !canReplicate {
+ return instance, log.Errorf("%+v cannot replicate from %+v. Reason: %+v", instance.Key, other.Key, err)
+ }
+ // simplest:
+ if InstanceIsMasterOf(other, instance) {
+ // already the desired setup.
+ return Repoint(&instance.Key, &other.Key, GTIDHintNeutral)
+ }
+ // Do we have record of equivalent coordinates?
+ if !instance.IsBinlogServer() {
+ if movedInstance, err := MoveEquivalent(&instance.Key, &other.Key); err == nil {
+ return movedInstance, nil
+ }
+ }
+ // Try and take advantage of binlog servers:
+ if InstancesAreSiblings(instance, other) && other.IsBinlogServer() {
+ return MoveBelow(&instance.Key, &other.Key)
+ }
+ instanceMaster, _, err := ReadInstance(&instance.MasterKey)
+ if err != nil {
+ return instance, err
+ }
+ if instanceMaster != nil && instanceMaster.MasterKey.Equals(&other.Key) && instanceMaster.IsBinlogServer() {
+ // Moving to grandparent via binlog server
+ return Repoint(&instance.Key, &instanceMaster.MasterKey, GTIDHintDeny)
+ }
+ if other.IsBinlogServer() {
+ if instanceMaster != nil && instanceMaster.IsBinlogServer() && InstancesAreSiblings(instanceMaster, other) {
+ // Special case: this is a binlog server family; we move under the uncle, in one single step
+ return Repoint(&instance.Key, &other.Key, GTIDHintDeny)
+ }
+
+ // Relocate to its master, then repoint to the binlog server
+ otherMaster, found, err := ReadInstance(&other.MasterKey)
+ if err != nil {
+ return instance, err
+ }
+ if !found {
+ return instance, log.Errorf("Cannot find master %+v", other.MasterKey)
+ }
+ if !other.IsLastCheckValid {
+ return instance, log.Errorf("Binlog server %+v is not reachable. It would take two steps to relocate %+v below it, and I won't even do the first step.", other.Key, instance.Key)
+ }
+
+ log.Debugf("Relocating to a binlog server; will first attempt to relocate to the binlog server's master: %+v, and then repoint down", otherMaster.Key)
+ if _, err := relocateBelowInternal(instance, otherMaster); err != nil {
+ return instance, err
+ }
+ return Repoint(&instance.Key, &other.Key, GTIDHintDeny)
+ }
+ if instance.IsBinlogServer() {
+ // Can only move within the binlog-server family tree
+ // And these have been covered just now: move up from a master binlog server, move below a sibling binlog server.
+ // sure, the family can be more complex, but we keep these operations atomic
+ return nil, log.Errorf("Relocating binlog server %+v below %+v turns out to be too complex; please do it manually", instance.Key, other.Key)
+ }
+ // Next, try GTID
+ if _, _, gtidCompatible := instancesAreGTIDAndCompatible(instance, other); gtidCompatible {
+ return moveInstanceBelowViaGTID(instance, other)
+ }
+
+ // Next, try Pseudo-GTID
+ if instance.UsingPseudoGTID && other.UsingPseudoGTID {
+ // We prefer PseudoGTID to anything else because, while it takes longer to run, it does not issue
+ // a STOP SLAVE on any server other than "instance" itself.
+ instance, _, err := MatchBelow(&instance.Key, &other.Key, true)
+ return instance, err
+ }
+ // No Pseudo-GTID; check simple binlog file/pos operations:
+ if InstancesAreSiblings(instance, other) {
+ // If comastering, only move below if it's read-only
+ if !other.IsCoMaster || other.ReadOnly {
+ return MoveBelow(&instance.Key, &other.Key)
+ }
+ }
+ // See if we need to MoveUp
+ if instanceMaster != nil && instanceMaster.MasterKey.Equals(&other.Key) {
+ // Moving to grandparent--handles co-mastering writable case
+ return MoveUp(&instance.Key)
+ }
+ if instanceMaster != nil && instanceMaster.IsBinlogServer() {
+ // Break operation into two: move (repoint) up, then continue
+ if _, err := MoveUp(&instance.Key); err != nil {
+ return instance, err
+ }
+ return relocateBelowInternal(instance, other)
+ }
+ // Too complex
+ return nil, log.Errorf("Relocating %+v below %+v turns out to be too complex; please do it manually", instance.Key, other.Key)
+}
+
+// RelocateBelow will attempt moving instance indicated by instanceKey below another instance.
+// Orchestrator will try and figure out the best way to relocate the server. This could span normal
+// binlog-position, pseudo-gtid, repointing, binlog servers...
+func RelocateBelow(instanceKey, otherKey *InstanceKey) (*Instance, error) {
+ instance, found, err := ReadInstance(instanceKey)
+ if err != nil || !found {
+ return instance, log.Errorf("Error reading %+v", *instanceKey)
+ }
+ // Relocation of group secondaries makes no sense; group secondaries, by definition, always replicate from the group
+ // primary
+ if instance.IsReplicationGroupSecondary() {
+ return instance, log.Errorf("relocate: %+v is a secondary replication group member, hence, it cannot be relocated", instance.Key)
+ }
+ other, found, err := ReadInstance(otherKey)
+ if err != nil || !found {
+ return instance, log.Errorf("Error reading %+v", *otherKey)
+ }
+ // Disallow setting up a group primary to replicate from a group secondary
+ if instance.IsReplicationGroupPrimary() && other.ReplicationGroupName == instance.ReplicationGroupName {
+ return instance, log.Errorf("relocate: Setting a group primary to replicate from another member of its group is disallowed")
+ }
+ if other.IsDescendantOf(instance) {
+ return instance, log.Errorf("relocate: %+v is a descendant of %+v", *otherKey, instance.Key)
+ }
+ instance, err = relocateBelowInternal(instance, other)
+ if err == nil {
+ AuditOperation("relocate-below", instanceKey, fmt.Sprintf("relocated %+v below %+v", *instanceKey, *otherKey))
+ }
+ return instance, err
+}
+
+// relocateReplicasInternal is a potentially recursive function which chooses how to relocate
+// replicas of an instance below another.
+// It may choose to use Pseudo-GTID, or normal binlog positions, or take advantage of binlog servers,
+// or it may combine any of the above in a multi-step operation.
+func relocateReplicasInternal(replicas [](*Instance), instance, other *Instance) ([](*Instance), error, []error) {
+ errs := []error{}
+ var err error
+ // simplest:
+ if instance.Key.Equals(&other.Key) {
+ // already the desired setup.
+ return RepointTo(replicas, &other.Key) + } + // Try and take advantage of binlog servers: + if InstanceIsMasterOf(other, instance) && instance.IsBinlogServer() { + // Up from a binlog server + return RepointTo(replicas, &other.Key) + } + if InstanceIsMasterOf(instance, other) && other.IsBinlogServer() { + // Down under a binlog server + return RepointTo(replicas, &other.Key) + } + if InstancesAreSiblings(instance, other) && instance.IsBinlogServer() && other.IsBinlogServer() { + // Between siblings + return RepointTo(replicas, &other.Key) + } + if other.IsBinlogServer() { + // Relocate to binlog server's parent (recursive call), then repoint down + otherMaster, found, err := ReadInstance(&other.MasterKey) + if err != nil || !found { + return nil, err, errs + } + replicas, err, errs = relocateReplicasInternal(replicas, instance, otherMaster) + if err != nil { + return replicas, err, errs + } + + return RepointTo(replicas, &other.Key) + } + // GTID + { + movedReplicas, unmovedReplicas, err, errs := moveReplicasViaGTID(replicas, other, nil) + + if len(movedReplicas) == len(replicas) { + // Moved (or tried moving) everything via GTID + return movedReplicas, err, errs + } else if len(movedReplicas) > 0 { + // something was moved via GTID; let's try further on + return relocateReplicasInternal(unmovedReplicas, instance, other) + } + // Otherwise nothing was moved via GTID. Maybe we don't have any GTIDs, we continue. + } + + // Pseudo GTID + if other.UsingPseudoGTID { + // Which replicas are using Pseudo GTID? + var pseudoGTIDReplicas [](*Instance) + for _, replica := range replicas { + _, _, hasToBeGTID := instancesAreGTIDAndCompatible(replica, other) + if replica.UsingPseudoGTID && !hasToBeGTID { + pseudoGTIDReplicas = append(pseudoGTIDReplicas, replica) + } + } + pseudoGTIDReplicas, _, err, errs = MultiMatchBelow(pseudoGTIDReplicas, &other.Key, nil) + return pseudoGTIDReplicas, err, errs + } + + // Normal binlog file:pos + if InstanceIsMasterOf(other, instance) { + // MoveUpReplicas -- but not supporting "replicas" argument at this time. + } + + // Too complex + return nil, log.Errorf("Relocating %+v replicas of %+v below %+v turns to be too complex; please do it manually", len(replicas), instance.Key, other.Key), errs +} + +// RelocateReplicas will attempt moving replicas of an instance indicated by instanceKey below another instance. +// Orchestrator will try and figure out the best way to relocate the servers. This could span normal +// binlog-position, pseudo-gtid, repointing, binlog servers... 
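+// An optional pattern limits which replicas are considered; the designated target (otherKey) is always removed from the set first.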
+func RelocateReplicas(instanceKey, otherKey *InstanceKey, pattern string) (replicas [](*Instance), other *Instance, err error, errs []error) { + + instance, found, err := ReadInstance(instanceKey) + if err != nil || !found { + return replicas, other, log.Errorf("Error reading %+v", *instanceKey), errs + } + other, found, err = ReadInstance(otherKey) + if err != nil || !found { + return replicas, other, log.Errorf("Error reading %+v", *otherKey), errs + } + + replicas, err = ReadReplicaInstances(instanceKey) + if err != nil { + return replicas, other, err, errs + } + replicas = RemoveInstance(replicas, otherKey) + replicas = filterInstancesByPattern(replicas, pattern) + if len(replicas) == 0 { + // Nothing to do + return replicas, other, nil, errs + } + for _, replica := range replicas { + if other.IsDescendantOf(replica) { + return replicas, other, log.Errorf("relocate-replicas: %+v is a descendant of %+v", *otherKey, replica.Key), errs + } + } + replicas, err, errs = relocateReplicasInternal(replicas, instance, other) + + if err == nil { + AuditOperation("relocate-replicas", instanceKey, fmt.Sprintf("relocated %+v replicas of %+v below %+v", len(replicas), *instanceKey, *otherKey)) + } + return replicas, other, err, errs +} + +// PurgeBinaryLogsTo attempts to 'PURGE BINARY LOGS' until given binary log is reached +func PurgeBinaryLogsTo(instanceKey *InstanceKey, logFile string, force bool) (*Instance, error) { + replicas, err := ReadReplicaInstances(instanceKey) + if err != nil { + return nil, err + } + if !force { + purgeCoordinates := &BinlogCoordinates{LogFile: logFile, LogPos: 0} + for _, replica := range replicas { + if !purgeCoordinates.SmallerThan(&replica.ExecBinlogCoordinates) { + return nil, log.Errorf("Unsafe to purge binary logs on %+v up to %s because replica %+v has only applied up to %+v", *instanceKey, logFile, replica.Key, replica.ExecBinlogCoordinates) + } + } + } + return purgeBinaryLogsTo(instanceKey, logFile) +} + +// PurgeBinaryLogsToLatest attempts to 'PURGE BINARY LOGS' until latest binary log +func PurgeBinaryLogsToLatest(instanceKey *InstanceKey, force bool) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + return PurgeBinaryLogsTo(instanceKey, instance.SelfBinlogCoordinates.LogFile, force) +} diff --git a/go/vt/orchestrator/inst/instance_topology_dao.go b/go/vt/orchestrator/inst/instance_topology_dao.go new file mode 100644 index 0000000000..faa3b2f489 --- /dev/null +++ b/go/vt/orchestrator/inst/instance_topology_dao.go @@ -0,0 +1,1220 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "context" + "database/sql" + "fmt" + "strings" + "time" + + "github.com/patrickmn/go-cache" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + "vitess.io/vitess/go/vt/orchestrator/util" +) + +// Max concurrency for bulk topology operations +const topologyConcurrency = 128 + +var topologyConcurrencyChan = make(chan bool, topologyConcurrency) +var supportedAutoPseudoGTIDWriters *cache.Cache = cache.New(config.CheckAutoPseudoGTIDGrantsIntervalSeconds*time.Second, time.Second) + +type OperationGTIDHint string + +const ( + GTIDHintDeny OperationGTIDHint = "NoGTID" + GTIDHintNeutral = "GTIDHintNeutral" + GTIDHintForce = "GTIDHintForce" +) + +const ( + Error1201CouldnotInitializeMasterInfoStructure = "Error 1201:" +) + +// ExecInstance executes a given query on the given MySQL topology instance +func ExecInstance(instanceKey *InstanceKey, query string, args ...interface{}) (sql.Result, error) { + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return nil, err + } + return sqlutils.ExecNoPrepare(db, query, args...) +} + +// ExecuteOnTopology will execute given function while maintaining concurrency limit +// on topology servers. It is safe in the sense that we will not leak tokens. +func ExecuteOnTopology(f func()) { + topologyConcurrencyChan <- true + defer func() { recover(); <-topologyConcurrencyChan }() + f() +} + +// ScanInstanceRow executes a read-a-single-row query on a given MySQL topology instance +func ScanInstanceRow(instanceKey *InstanceKey, query string, dest ...interface{}) error { + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return err + } + err = db.QueryRow(query).Scan(dest...) + return err +} + +// EmptyCommitInstance issues an empty COMMIT on a given instance +func EmptyCommitInstance(instanceKey *InstanceKey) error { + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return err + } + tx, err := db.Begin() + if err != nil { + return err + } + err = tx.Commit() + if err != nil { + return err + } + return err +} + +// RefreshTopologyInstance will synchronuously re-read topology instance +func RefreshTopologyInstance(instanceKey *InstanceKey) (*Instance, error) { + _, err := ReadTopologyInstance(instanceKey) + if err != nil { + return nil, err + } + + inst, found, err := ReadInstance(instanceKey) + if err != nil || !found { + return nil, err + } + + return inst, nil +} + +// RefreshTopologyInstances will do a blocking (though concurrent) refresh of all given instances +func RefreshTopologyInstances(instances [](*Instance)) { + // use concurrency but wait for all to complete + barrier := make(chan InstanceKey) + for _, instance := range instances { + instance := instance + go func() { + // Signal completed replica + defer func() { barrier <- instance.Key }() + // Wait your turn to read a replica + ExecuteOnTopology(func() { + log.Debugf("... reading instance: %+v", instance.Key) + ReadTopologyInstance(&instance.Key) + }) + }() + } + for range instances { + <-barrier + } +} + +// GetReplicationRestartPreserveStatements returns a sequence of statements that make sure a replica is stopped +// and then returned to the same state. For example, if the replica was fully running, this will issue +// a STOP on both io_thread and sql_thread, followed by START on both. 
If one of them is not running +// at the time this function is called, said thread will be neither stopped nor started. +// The caller may provide an injected statememt, to be executed while the replica is stopped. +// This is useful for CHANGE MASTER TO commands, that unfortunately must take place while the replica +// is completely stopped. +func GetReplicationRestartPreserveStatements(instanceKey *InstanceKey, injectedStatement string) (statements []string, err error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return statements, err + } + if instance.ReplicationIOThreadRuning { + statements = append(statements, SemicolonTerminated(`stop slave io_thread`)) + } + if instance.ReplicationSQLThreadRuning { + statements = append(statements, SemicolonTerminated(`stop slave sql_thread`)) + } + if injectedStatement != "" { + statements = append(statements, SemicolonTerminated(injectedStatement)) + } + if instance.ReplicationSQLThreadRuning { + statements = append(statements, SemicolonTerminated(`start slave sql_thread`)) + } + if instance.ReplicationIOThreadRuning { + statements = append(statements, SemicolonTerminated(`start slave io_thread`)) + } + return statements, err +} + +// FlushBinaryLogs attempts a 'FLUSH BINARY LOGS' statement on the given instance. +func FlushBinaryLogs(instanceKey *InstanceKey, count int) (*Instance, error) { + if *config.RuntimeCLIFlags.Noop { + return nil, fmt.Errorf("noop: aborting flush-binary-logs operation on %+v; signalling error but nothing went wrong.", *instanceKey) + } + + for i := 0; i < count; i++ { + _, err := ExecInstance(instanceKey, `flush binary logs`) + if err != nil { + return nil, log.Errore(err) + } + } + + log.Infof("flush-binary-logs count=%+v on %+v", count, *instanceKey) + AuditOperation("flush-binary-logs", instanceKey, "success") + + return ReadTopologyInstance(instanceKey) +} + +// FlushBinaryLogsTo attempts to 'FLUSH BINARY LOGS' until given binary log is reached +func FlushBinaryLogsTo(instanceKey *InstanceKey, logFile string) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + distance := instance.SelfBinlogCoordinates.FileNumberDistance(&BinlogCoordinates{LogFile: logFile}) + if distance < 0 { + return nil, log.Errorf("FlushBinaryLogsTo: target log file %+v is smaller than current log file %+v", logFile, instance.SelfBinlogCoordinates.LogFile) + } + return FlushBinaryLogs(instanceKey, distance) +} + +// purgeBinaryLogsTo attempts to 'PURGE BINARY LOGS' until given binary log is reached +func purgeBinaryLogsTo(instanceKey *InstanceKey, logFile string) (*Instance, error) { + if *config.RuntimeCLIFlags.Noop { + return nil, fmt.Errorf("noop: aborting purge-binary-logs operation on %+v; signalling error but nothing went wrong.", *instanceKey) + } + + _, err := ExecInstance(instanceKey, "purge binary logs to ?", logFile) + if err != nil { + return nil, log.Errore(err) + } + + log.Infof("purge-binary-logs to=%+v on %+v", logFile, *instanceKey) + AuditOperation("purge-binary-logs", instanceKey, "success") + + return ReadTopologyInstance(instanceKey) +} + +func SetSemiSyncMaster(instanceKey *InstanceKey, enableMaster bool) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + if _, err := ExecInstance(instanceKey, "set @@global.rpl_semi_sync_master_enabled=?", enableMaster); err != nil { + return instance, log.Errore(err) + } + return 
ReadTopologyInstance(instanceKey) +} + +func SetSemiSyncReplica(instanceKey *InstanceKey, enableReplica bool) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, err + } + if instance.SemiSyncReplicaEnabled == enableReplica { + return instance, nil + } + if _, err := ExecInstance(instanceKey, "set @@global.rpl_semi_sync_slave_enabled=?", enableReplica); err != nil { + return instance, log.Errore(err) + } + if instance.ReplicationIOThreadRuning { + // Need to apply change by stopping starting IO thread + ExecInstance(instanceKey, "stop slave io_thread") + if _, err := ExecInstance(instanceKey, "start slave io_thread"); err != nil { + return instance, log.Errore(err) + } + } + return ReadTopologyInstance(instanceKey) +} + +func RestartReplicationQuick(instanceKey *InstanceKey) error { + for _, cmd := range []string{`stop slave sql_thread`, `stop slave io_thread`, `start slave io_thread`, `start slave sql_thread`} { + if _, err := ExecInstance(instanceKey, cmd); err != nil { + return log.Errorf("%+v: RestartReplicationQuick: '%q' failed: %+v", *instanceKey, cmd, err) + } else { + log.Infof("%s on %+v as part of RestartReplicationQuick", cmd, *instanceKey) + } + } + return nil +} + +// StopReplicationNicely stops a replica such that SQL_thread and IO_thread are aligned (i.e. +// SQL_thread consumes all relay log entries) +// It will actually START the sql_thread even if the replica is completely stopped. +func StopReplicationNicely(instanceKey *InstanceKey, timeout time.Duration) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + if !instance.ReplicationThreadsExist() { + return instance, fmt.Errorf("instance is not a replica: %+v", instanceKey) + } + + // stop io_thread, start sql_thread but catch any errors + for _, cmd := range []string{`stop slave io_thread`, `start slave sql_thread`} { + if _, err := ExecInstance(instanceKey, cmd); err != nil { + return nil, log.Errorf("%+v: StopReplicationNicely: '%q' failed: %+v", *instanceKey, cmd, err) + } + } + + if instance.SQLDelay == 0 { + // Otherwise we don't bother. + if instance, err = WaitForSQLThreadUpToDate(instanceKey, timeout, 0); err != nil { + return instance, err + } + } + + _, err = ExecInstance(instanceKey, `stop slave`) + if err != nil { + // Patch; current MaxScale behavior for STOP SLAVE is to throw an error if replica already stopped. + if instance.isMaxScale() && err.Error() == "Error 1199: Slave connection is not running" { + err = nil + } + } + if err != nil { + return instance, log.Errore(err) + } + + instance, err = ReadTopologyInstance(instanceKey) + log.Infof("Stopped replication nicely on %+v, Self:%+v, Exec:%+v", *instanceKey, instance.SelfBinlogCoordinates, instance.ExecBinlogCoordinates) + return instance, err +} + +func WaitForSQLThreadUpToDate(instanceKey *InstanceKey, overallTimeout time.Duration, staleCoordinatesTimeout time.Duration) (instance *Instance, err error) { + // Otherwise we don't bother. 
+ var lastExecBinlogCoordinates BinlogCoordinates + + if overallTimeout == 0 { + overallTimeout = 24 * time.Hour + } + if staleCoordinatesTimeout == 0 { + staleCoordinatesTimeout = time.Duration(config.Config.ReasonableReplicationLagSeconds) * time.Second + } + generalTimer := time.NewTimer(overallTimeout) + staleTimer := time.NewTimer(staleCoordinatesTimeout) + for { + instance, err := RetryInstanceFunction(func() (*Instance, error) { + return ReadTopologyInstance(instanceKey) + }) + if err != nil { + return instance, log.Errore(err) + } + + if instance.SQLThreadUpToDate() { + // Woohoo + return instance, nil + } + if instance.SQLDelay != 0 { + return instance, log.Errorf("WaitForSQLThreadUpToDate: instance %+v has SQL Delay %+v. Operation is irrelevant", *instanceKey, instance.SQLDelay) + } + + if !instance.ExecBinlogCoordinates.Equals(&lastExecBinlogCoordinates) { + // means we managed to apply binlog events. We made progress... + // so we reset the "staleness" timer + if !staleTimer.Stop() { + <-staleTimer.C + } + staleTimer.Reset(staleCoordinatesTimeout) + } + lastExecBinlogCoordinates = instance.ExecBinlogCoordinates + + select { + case <-generalTimer.C: + return instance, log.Errorf("WaitForSQLThreadUpToDate timeout on %+v after duration %+v", *instanceKey, overallTimeout) + case <-staleTimer.C: + return instance, log.Errorf("WaitForSQLThreadUpToDate stale coordinates timeout on %+v after duration %+v", *instanceKey, staleCoordinatesTimeout) + default: + log.Debugf("WaitForSQLThreadUpToDate waiting on %+v", *instanceKey) + time.Sleep(retryInterval) + } + } +} + +// StopReplicas will stop replication concurrently on given set of replicas. +// It will potentially do nothing, or attempt to stop _nicely_ or just stop normally, all according to stopReplicationMethod +func StopReplicas(replicas [](*Instance), stopReplicationMethod StopReplicationMethod, timeout time.Duration) [](*Instance) { + if stopReplicationMethod == NoStopReplication { + return replicas + } + refreshedReplicas := [](*Instance){} + + log.Debugf("Stopping %d replicas via %s", len(replicas), string(stopReplicationMethod)) + // use concurrency but wait for all to complete + barrier := make(chan *Instance) + for _, replica := range replicas { + replica := replica + go func() { + updatedReplica := &replica + // Signal completed replica + defer func() { barrier <- *updatedReplica }() + // Wait your turn to read a replica + ExecuteOnTopology(func() { + if stopReplicationMethod == StopReplicationNice { + StopReplicationNicely(&replica.Key, timeout) + } + replica, _ = StopReplication(&replica.Key) + updatedReplica = &replica + }) + }() + } + for range replicas { + refreshedReplicas = append(refreshedReplicas, <-barrier) + } + return refreshedReplicas +} + +// StopReplicasNicely will attemt to stop all given replicas nicely, up to timeout +func StopReplicasNicely(replicas [](*Instance), timeout time.Duration) [](*Instance) { + return StopReplicas(replicas, StopReplicationNice, timeout) +} + +// StopReplication stops replication on a given instance +func StopReplication(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + if !instance.IsReplica() { + return instance, fmt.Errorf("instance is not a replica: %+v", instanceKey) + } + _, err = ExecInstance(instanceKey, `stop slave`) + if err != nil { + // Patch; current MaxScale behavior for STOP SLAVE is to throw an error if replica already stopped. 
+ if instance.isMaxScale() && err.Error() == "Error 1199: Slave connection is not running" { + err = nil + } + } + if err != nil { + return instance, log.Errore(err) + } + instance, err = ReadTopologyInstance(instanceKey) + + log.Infof("Stopped replication on %+v, Self:%+v, Exec:%+v", *instanceKey, instance.SelfBinlogCoordinates, instance.ExecBinlogCoordinates) + return instance, err +} + +// waitForReplicationState waits for both replication threads to be either running or not running, together. +// This is useful post- `start slave` operation, ensuring both threads are actually running, +// or post `stop slave` operation, ensuring both threads are not running. +func waitForReplicationState(instanceKey *InstanceKey, expectedState ReplicationThreadState) (expectationMet bool, err error) { + waitDuration := time.Second + waitInterval := 10 * time.Millisecond + startTime := time.Now() + + for { + // Since this is an incremental aggressive polling, it's OK if an occasional + // error is observed. We don't bail out on a single error. + if expectationMet, _ := expectReplicationThreadsState(instanceKey, expectedState); expectationMet { + return true, nil + } + if time.Since(startTime)+waitInterval > waitDuration { + break + } + time.Sleep(waitInterval) + waitInterval = 2 * waitInterval + } + return false, nil +} + +// StartReplication starts replication on a given instance. +func StartReplication(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + if !instance.IsReplica() { + return instance, fmt.Errorf("instance is not a replica: %+v", instanceKey) + } + + // If async fallback is disallowed, we'd better make sure to enable replicas to + // send ACKs before START SLAVE. Replica ACKing is off at mysqld startup because + // some replicas (those that must never be promoted) should never ACK. + // Note: We assume that replicas use 'skip-slave-start' so they won't + // START SLAVE on their own upon restart. + if instance.SemiSyncEnforced { + // Send ACK only from promotable instances. + sendACK := instance.PromotionRule != MustNotPromoteRule + // Always disable master setting, in case we're converting a former master. 
+ if err := EnableSemiSync(instanceKey, false, sendACK); err != nil {
+ return instance, log.Errore(err)
+ }
+ }
+
+ _, err = ExecInstance(instanceKey, `start slave`)
+ if err != nil {
+ return instance, log.Errore(err)
+ }
+ log.Infof("Started replication on %+v", instanceKey)
+
+ waitForReplicationState(instanceKey, ReplicationThreadStateRunning)
+
+ instance, err = ReadTopologyInstance(instanceKey)
+ if err != nil {
+ return instance, log.Errore(err)
+ }
+ if !instance.ReplicaRunning() {
+ return instance, ReplicationNotRunningError
+ }
+ return instance, nil
+}
+
+// RestartReplication stops & starts replication on a given instance
+func RestartReplication(instanceKey *InstanceKey) (instance *Instance, err error) {
+ instance, err = StopReplication(instanceKey)
+ if err != nil {
+ return instance, log.Errore(err)
+ }
+ instance, err = StartReplication(instanceKey)
+ return instance, log.Errore(err)
+}
+
+// StartReplicas starts replication concurrently on the given set of replicas
+func StartReplicas(replicas [](*Instance)) {
+ // use concurrency but wait for all to complete
+ log.Debugf("Starting %d replicas", len(replicas))
+ barrier := make(chan InstanceKey)
+ for _, instance := range replicas {
+ instance := instance
+ go func() {
+ // Signal completed replica
+ defer func() { barrier <- instance.Key }()
+ // Wait your turn to read a replica
+ ExecuteOnTopology(func() { StartReplication(&instance.Key) })
+ }()
+ }
+ for range replicas {
+ <-barrier
+ }
+}
+
+func WaitForExecBinlogCoordinatesToReach(instanceKey *InstanceKey, coordinates *BinlogCoordinates, maxWait time.Duration) (instance *Instance, exactMatch bool, err error) {
+ startTime := time.Now()
+ for {
+ if maxWait != 0 && time.Since(startTime) > maxWait {
+ return nil, exactMatch, fmt.Errorf("WaitForExecBinlogCoordinatesToReach: reached maxWait %+v on %+v", maxWait, *instanceKey)
+ }
+ instance, err = ReadTopologyInstance(instanceKey)
+ if err != nil {
+ return instance, exactMatch, log.Errore(err)
+ }
+
+ switch {
+ case instance.ExecBinlogCoordinates.SmallerThan(coordinates):
+ time.Sleep(retryInterval)
+ case instance.ExecBinlogCoordinates.Equals(coordinates):
+ return instance, true, nil
+ case coordinates.SmallerThan(&instance.ExecBinlogCoordinates):
+ return instance, false, nil
+ }
+ }
+ return instance, exactMatch, err
+}
+
+// StartReplicationUntilMasterCoordinates issues a START SLAVE UNTIL... statement on given instance
+func StartReplicationUntilMasterCoordinates(instanceKey *InstanceKey, masterCoordinates *BinlogCoordinates) (*Instance, error) {
+ instance, err := ReadTopologyInstance(instanceKey)
+ if err != nil {
+ return instance, log.Errore(err)
+ }
+
+ if !instance.IsReplica() {
+ return instance, fmt.Errorf("instance is not a replica: %+v", instanceKey)
+ }
+ if !instance.ReplicationThreadsStopped() {
+ return instance, fmt.Errorf("replication threads are not stopped: %+v", instanceKey)
+ }
+
+ log.Infof("Will start replication on %+v until coordinates: %+v", instanceKey, masterCoordinates)
+
+ if instance.SemiSyncEnforced {
+ // Send ACK only from promotable instances.
+ sendACK := instance.PromotionRule != MustNotPromoteRule
+ // Always disable master setting, in case we're converting a former master.
+ if err := EnableSemiSync(instanceKey, false, sendACK); err != nil {
+ return instance, log.Errore(err)
+ }
+ }
+
+ // MariaDB has a bug: a CHANGE MASTER TO statement does not work properly with prepared statement...
:P + // See https://mariadb.atlassian.net/browse/MDEV-7640 + // This is the reason for ExecInstance + _, err = ExecInstance(instanceKey, "start slave until master_log_file=?, master_log_pos=?", + masterCoordinates.LogFile, masterCoordinates.LogPos) + if err != nil { + return instance, log.Errore(err) + } + + instance, exactMatch, err := WaitForExecBinlogCoordinatesToReach(instanceKey, masterCoordinates, 0) + if err != nil { + return instance, log.Errore(err) + } + if !exactMatch { + return instance, fmt.Errorf("Start SLAVE UNTIL is past coordinates: %+v", instanceKey) + } + + instance, err = StopReplication(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + return instance, err +} + +// EnableSemiSync sets the rpl_semi_sync_(master|replica)_enabled variables +// on a given instance. +func EnableSemiSync(instanceKey *InstanceKey, master, replica bool) error { + log.Infof("instance %+v rpl_semi_sync_master_enabled: %t, rpl_semi_sync_slave_enabled: %t", instanceKey, master, replica) + _, err := ExecInstance(instanceKey, + `set global rpl_semi_sync_master_enabled = ?, global rpl_semi_sync_slave_enabled = ?`, + master, replica) + return err +} + +// ChangeMasterCredentials issues a CHANGE MASTER TO... MASTER_USER=, MASTER_PASSWORD=... +func ChangeMasterCredentials(instanceKey *InstanceKey, masterUser string, masterPassword string) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + if masterUser == "" { + return instance, log.Errorf("Empty user in ChangeMasterCredentials() for %+v", *instanceKey) + } + + if instance.ReplicationThreadsExist() && !instance.ReplicationThreadsStopped() { + return instance, fmt.Errorf("ChangeMasterTo: Cannot change master on: %+v because replication is running", *instanceKey) + } + log.Debugf("ChangeMasterTo: will attempt changing master credentials on %+v", *instanceKey) + + if *config.RuntimeCLIFlags.Noop { + return instance, fmt.Errorf("noop: aborting CHANGE MASTER TO operation on %+v; signalling error but nothing went wrong.", *instanceKey) + } + _, err = ExecInstance(instanceKey, "change master to master_user=?, master_password=?", + masterUser, masterPassword) + + if err != nil { + return instance, log.Errore(err) + } + + log.Infof("ChangeMasterTo: Changed master credentials on %+v", *instanceKey) + + instance, err = ReadTopologyInstance(instanceKey) + return instance, err +} + +// EnableMasterSSL issues CHANGE MASTER TO MASTER_SSL=1 +func EnableMasterSSL(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + if instance.ReplicationThreadsExist() && !instance.ReplicationThreadsStopped() { + return instance, fmt.Errorf("EnableMasterSSL: Cannot enable SSL replication on %+v because replication threads are not stopped", *instanceKey) + } + log.Debugf("EnableMasterSSL: Will attempt enabling SSL replication on %+v", *instanceKey) + + if *config.RuntimeCLIFlags.Noop { + return instance, fmt.Errorf("noop: aborting CHANGE MASTER TO MASTER_SSL=1 operation on %+v; signaling error but nothing went wrong.", *instanceKey) + } + _, err = ExecInstance(instanceKey, "change master to master_ssl=1") + + if err != nil { + return instance, log.Errore(err) + } + + log.Infof("EnableMasterSSL: Enabled SSL replication on %+v", *instanceKey) + + instance, err = ReadTopologyInstance(instanceKey) + return instance, err +} + +// See https://bugs.mysql.com/bug.php?id=83713 +func 
workaroundBug83713(instanceKey *InstanceKey) { + log.Debugf("workaroundBug83713: %+v", *instanceKey) + queries := []string{ + `reset slave`, + `start slave IO_THREAD`, + `stop slave IO_THREAD`, + `reset slave`, + } + for _, query := range queries { + if _, err := ExecInstance(instanceKey, query); err != nil { + log.Debugf("workaroundBug83713: error on %s: %+v", query, err) + } + } +} + +// ChangeMasterTo changes the given instance's master according to given input. +func ChangeMasterTo(instanceKey *InstanceKey, masterKey *InstanceKey, masterBinlogCoordinates *BinlogCoordinates, skipUnresolve bool, gtidHint OperationGTIDHint) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + if instance.ReplicationThreadsExist() && !instance.ReplicationThreadsStopped() { + return instance, fmt.Errorf("ChangeMasterTo: Cannot change master on: %+v because replication threads are not stopped", *instanceKey) + } + log.Debugf("ChangeMasterTo: will attempt changing master on %+v to %+v, %+v", *instanceKey, *masterKey, *masterBinlogCoordinates) + changeToMasterKey := masterKey + if !skipUnresolve { + unresolvedMasterKey, nameUnresolved, err := UnresolveHostname(masterKey) + if err != nil { + log.Debugf("ChangeMasterTo: aborting operation on %+v due to resolving error on %+v: %+v", *instanceKey, *masterKey, err) + return instance, err + } + if nameUnresolved { + log.Debugf("ChangeMasterTo: Unresolved %+v into %+v", *masterKey, unresolvedMasterKey) + } + changeToMasterKey = &unresolvedMasterKey + } + + if *config.RuntimeCLIFlags.Noop { + return instance, fmt.Errorf("noop: aborting CHANGE MASTER TO operation on %+v; signalling error but nothing went wrong.", *instanceKey) + } + + originalMasterKey := instance.MasterKey + originalExecBinlogCoordinates := instance.ExecBinlogCoordinates + + var changeMasterFunc func() error + changedViaGTID := false + if instance.UsingMariaDBGTID && gtidHint != GTIDHintDeny { + // Keep on using GTID + changeMasterFunc = func() error { + _, err := ExecInstance(instanceKey, "change master to master_host=?, master_port=?", + changeToMasterKey.Hostname, changeToMasterKey.Port) + return err + } + changedViaGTID = true + } else if instance.UsingMariaDBGTID && gtidHint == GTIDHintDeny { + // Make sure to not use GTID + changeMasterFunc = func() error { + _, err = ExecInstance(instanceKey, "change master to master_host=?, master_port=?, master_log_file=?, master_log_pos=?, master_use_gtid=no", + changeToMasterKey.Hostname, changeToMasterKey.Port, masterBinlogCoordinates.LogFile, masterBinlogCoordinates.LogPos) + return err + } + } else if instance.IsMariaDB() && gtidHint == GTIDHintForce { + // Is MariaDB; not using GTID, turn into GTID + mariadbGTIDHint := "slave_pos" + if !instance.ReplicationThreadsExist() { + // This instance is currently a master. As per https://mariadb.com/kb/en/change-master-to/#master_use_gtid + // we should be using current_pos. + // See also: + // - https://github.com/openark/orchestrator/issues/1146 + // - https://dba.stackexchange.com/a/234323 + mariadbGTIDHint = "current_pos" + } + changeMasterFunc = func() error { + _, err = ExecInstance(instanceKey, fmt.Sprintf("change master to master_host=?, master_port=?, master_use_gtid=%s", mariadbGTIDHint), + changeToMasterKey.Hostname, changeToMasterKey.Port) + return err + } + changedViaGTID = true + } else if instance.UsingOracleGTID && gtidHint != GTIDHintDeny { + // Is Oracle; already uses GTID; keep using it. 
+ changeMasterFunc = func() error { + _, err = ExecInstance(instanceKey, "change master to master_host=?, master_port=?", + changeToMasterKey.Hostname, changeToMasterKey.Port) + return err + } + changedViaGTID = true + } else if instance.UsingOracleGTID && gtidHint == GTIDHintDeny { + // Is Oracle; already uses GTID + changeMasterFunc = func() error { + _, err = ExecInstance(instanceKey, "change master to master_host=?, master_port=?, master_log_file=?, master_log_pos=?, master_auto_position=0", + changeToMasterKey.Hostname, changeToMasterKey.Port, masterBinlogCoordinates.LogFile, masterBinlogCoordinates.LogPos) + return err + } + } else if instance.SupportsOracleGTID && gtidHint == GTIDHintForce { + // Is Oracle; not using GTID right now; turn into GTID + changeMasterFunc = func() error { + _, err = ExecInstance(instanceKey, "change master to master_host=?, master_port=?, master_auto_position=1", + changeToMasterKey.Hostname, changeToMasterKey.Port) + return err + } + changedViaGTID = true + } else { + // Normal binlog file:pos + changeMasterFunc = func() error { + _, err = ExecInstance(instanceKey, "change master to master_host=?, master_port=?, master_log_file=?, master_log_pos=?", + changeToMasterKey.Hostname, changeToMasterKey.Port, masterBinlogCoordinates.LogFile, masterBinlogCoordinates.LogPos) + return err + } + } + err = changeMasterFunc() + if err != nil && instance.UsingOracleGTID && strings.Contains(err.Error(), Error1201CouldnotInitializeMasterInfoStructure) { + log.Debugf("ChangeMasterTo: got %+v", err) + workaroundBug83713(instanceKey) + err = changeMasterFunc() + } + if err != nil { + return instance, log.Errore(err) + } + WriteMasterPositionEquivalence(&originalMasterKey, &originalExecBinlogCoordinates, changeToMasterKey, masterBinlogCoordinates) + ResetInstanceRelaylogCoordinatesHistory(instanceKey) + + log.Infof("ChangeMasterTo: Changed master on %+v to: %+v, %+v. GTID: %+v", *instanceKey, masterKey, masterBinlogCoordinates, changedViaGTID) + + instance, err = ReadTopologyInstance(instanceKey) + return instance, err +} + +// SkipToNextBinaryLog changes master position to beginning of next binlog +// USE WITH CARE! +// Use case is binlog servers where the master was gone & replaced by another. 
+func SkipToNextBinaryLog(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + nextFileCoordinates, err := instance.ExecBinlogCoordinates.NextFileCoordinates() + if err != nil { + return instance, log.Errore(err) + } + nextFileCoordinates.LogPos = 4 + log.Debugf("Will skip replication on %+v to next binary log: %+v", instance.Key, nextFileCoordinates.LogFile) + + instance, err = ChangeMasterTo(&instance.Key, &instance.MasterKey, &nextFileCoordinates, false, GTIDHintNeutral) + if err != nil { + return instance, log.Errore(err) + } + AuditOperation("skip-binlog", instanceKey, fmt.Sprintf("Skipped replication to next binary log: %+v", nextFileCoordinates.LogFile)) + return StartReplication(instanceKey) +} + +// ResetReplication resets a replica, breaking the replication +func ResetReplication(instanceKey *InstanceKey) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + if instance.ReplicationThreadsExist() && !instance.ReplicationThreadsStopped() { + return instance, fmt.Errorf("Cannot reset replication on: %+v because replication threads are not stopped", instanceKey) + } + + if *config.RuntimeCLIFlags.Noop { + return instance, fmt.Errorf("noop: aborting reset-replication operation on %+v; signalling error but nothing went wrong.", *instanceKey) + } + + // MySQL's RESET SLAVE is done correctly; however SHOW SLAVE STATUS still returns old hostnames etc + // and only resets till after next restart. This leads to orchestrator still thinking the instance replicates + // from old host. We therefore forcibly modify the hostname. + // RESET SLAVE ALL command solves this, but only as of 5.6.3 + _, err = ExecInstance(instanceKey, `change master to master_host='_'`) + if err != nil { + return instance, log.Errore(err) + } + _, err = ExecInstance(instanceKey, `reset slave /*!50603 all */`) + if err != nil && strings.Contains(err.Error(), Error1201CouldnotInitializeMasterInfoStructure) { + log.Debugf("ResetReplication: got %+v", err) + workaroundBug83713(instanceKey) + _, err = ExecInstance(instanceKey, `reset slave /*!50603 all */`) + } + if err != nil { + return instance, log.Errore(err) + } + log.Infof("Reset replication %+v", instanceKey) + + instance, err = ReadTopologyInstance(instanceKey) + return instance, err +} + +// ResetMaster issues a RESET MASTER statement on given instance. Use with extreme care! 
+func ResetMaster(instanceKey *InstanceKey) (*Instance, error) {
+ instance, err := ReadTopologyInstance(instanceKey)
+ if err != nil {
+ return instance, log.Errore(err)
+ }
+
+ if instance.ReplicationThreadsExist() && !instance.ReplicationThreadsStopped() {
+ return instance, fmt.Errorf("Cannot reset master on: %+v because replication threads are not stopped", instanceKey)
+ }
+
+ if *config.RuntimeCLIFlags.Noop {
+ return instance, fmt.Errorf("noop: aborting reset-master operation on %+v; signalling error but nothing went wrong.", *instanceKey)
+ }
+
+ _, err = ExecInstance(instanceKey, `reset master`)
+ if err != nil {
+ return instance, log.Errore(err)
+ }
+ log.Infof("Reset master %+v", instanceKey)
+
+ instance, err = ReadTopologyInstance(instanceKey)
+ return instance, err
+}
+
+// setGTIDPurged sets the gtid_purged global variable on given instance
+func setGTIDPurged(instance *Instance, gtidPurged string) error {
+ if *config.RuntimeCLIFlags.Noop {
+ return fmt.Errorf("noop: aborting set-gtid-purged operation on %+v; signalling error but nothing went wrong.", instance.Key)
+ }
+
+ _, err := ExecInstance(&instance.Key, `set global gtid_purged := ?`, gtidPurged)
+ return err
+}
+
+// injectEmptyGTIDTransaction injects an empty transaction for the given GTID entry on given instance
+func injectEmptyGTIDTransaction(instanceKey *InstanceKey, gtidEntry *OracleGtidSetEntry) error {
+ db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port)
+ if err != nil {
+ return err
+ }
+ ctx := context.Background()
+ conn, err := db.Conn(ctx)
+ if err != nil {
+ return err
+ }
+ defer conn.Close()
+
+ if _, err := conn.ExecContext(ctx, fmt.Sprintf(`SET GTID_NEXT="%s"`, gtidEntry.String())); err != nil {
+ return err
+ }
+ tx, err := conn.BeginTx(ctx, &sql.TxOptions{})
+ if err != nil {
+ return err
+ }
+ if err := tx.Commit(); err != nil {
+ return err
+ }
+ if _, err := conn.ExecContext(ctx, `SET GTID_NEXT="AUTOMATIC"`); err != nil {
+ return err
+ }
+ return nil
+}
+
+// skipQueryClassic skips a query in normal binlog file:pos replication
+func skipQueryClassic(instance *Instance) error {
+ _, err := ExecInstance(&instance.Key, `set global sql_slave_skip_counter := 1`)
+ return err
+}
+
+// skipQueryOracleGtid skips a single query in an Oracle GTID replicating replica, by injecting an empty transaction
+func skipQueryOracleGtid(instance *Instance) error {
+ nextGtid, err := instance.NextGTID()
+ if err != nil {
+ return err
+ }
+ if nextGtid == "" {
+ return fmt.Errorf("Empty NextGTID() in skipQueryGtid() for %+v", instance.Key)
+ }
+ if _, err := ExecInstance(&instance.Key, `SET GTID_NEXT=?`, nextGtid); err != nil {
+ return err
+ }
+ if err := EmptyCommitInstance(&instance.Key); err != nil {
+ return err
+ }
+ if _, err := ExecInstance(&instance.Key, `SET GTID_NEXT='AUTOMATIC'`); err != nil {
+ return err
+ }
+ return nil
+}
+
+// SkipQuery skips a single query in a failed replication instance
+func SkipQuery(instanceKey *InstanceKey) (*Instance, error) {
+ instance, err := ReadTopologyInstance(instanceKey)
+ if err != nil {
+ return instance, log.Errore(err)
+ }
+
+ if !instance.IsReplica() {
+ return instance, fmt.Errorf("instance is not a replica: %+v", instanceKey)
+ }
+ if instance.ReplicationSQLThreadRuning {
+ return instance, fmt.Errorf("Replication SQL thread is running on %+v", instanceKey)
+ }
+ if instance.LastSQLError == "" {
+ return instance, fmt.Errorf("No SQL error on %+v", instanceKey)
+ }
+
+ if *config.RuntimeCLIFlags.Noop {
+ return instance, fmt.Errorf("noop: aborting skip-query operation on %+v; signalling error but nothing went 
wrong.", *instanceKey) + } + + log.Debugf("Skipping one query on %+v", instanceKey) + if instance.UsingOracleGTID { + err = skipQueryOracleGtid(instance) + } else if instance.UsingMariaDBGTID { + return instance, log.Errorf("%+v is replicating with MariaDB GTID. To skip a query first disable GTID, then skip, then enable GTID again", *instanceKey) + } else { + err = skipQueryClassic(instance) + } + if err != nil { + return instance, log.Errore(err) + } + AuditOperation("skip-query", instanceKey, "Skipped one query") + return StartReplication(instanceKey) +} + +// MasterPosWait issues a MASTER_POS_WAIT() an given instance according to given coordinates. +func MasterPosWait(instanceKey *InstanceKey, binlogCoordinates *BinlogCoordinates) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + _, err = ExecInstance(instanceKey, `select master_pos_wait(?, ?)`, binlogCoordinates.LogFile, binlogCoordinates.LogPos) + if err != nil { + return instance, log.Errore(err) + } + log.Infof("Instance %+v has reached coordinates: %+v", instanceKey, binlogCoordinates) + + instance, err = ReadTopologyInstance(instanceKey) + return instance, err +} + +// Attempt to read and return replication credentials from the mysql.slave_master_info system table +func ReadReplicationCredentials(instanceKey *InstanceKey) (replicationUser string, replicationPassword string, err error) { + if config.Config.ReplicationCredentialsQuery != "" { + err = ScanInstanceRow(instanceKey, config.Config.ReplicationCredentialsQuery, &replicationUser, &replicationPassword) + if err == nil && replicationUser == "" { + err = fmt.Errorf("Empty username retrieved by ReplicationCredentialsQuery") + } + if err == nil { + return replicationUser, replicationPassword, nil + } + log.Errore(err) + } + // Didn't get credentials from ReplicationCredentialsQuery, or ReplicationCredentialsQuery doesn't exist in the first place? + // We brute force our way through mysql.slave_master_info + { + query := ` + select + ifnull(max(User_name), '') as user, + ifnull(max(User_password), '') as password + from + mysql.slave_master_info + ` + err = ScanInstanceRow(instanceKey, query, &replicationUser, &replicationPassword) + if err == nil && replicationUser == "" { + err = fmt.Errorf("Empty username found in mysql.slave_master_info") + } + } + return replicationUser, replicationPassword, log.Errore(err) +} + +// SetReadOnly sets or clears the instance's global read_only variable +func SetReadOnly(instanceKey *InstanceKey, readOnly bool) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + if *config.RuntimeCLIFlags.Noop { + return instance, fmt.Errorf("noop: aborting set-read-only operation on %+v; signalling error but nothing went wrong.", *instanceKey) + } + + // If async fallback is disallowed, we're responsible for flipping the master + // semi-sync switch ON before accepting writes. The setting is off by default. + if instance.SemiSyncEnforced && !readOnly { + // Send ACK only from promotable instances. 
+ sendACK := instance.PromotionRule != MustNotPromoteRule + if err := EnableSemiSync(instanceKey, true, sendACK); err != nil { + return instance, log.Errore(err) + } + } + + if _, err := ExecInstance(instanceKey, "set global read_only = ?", readOnly); err != nil { + return instance, log.Errore(err) + } + if config.Config.UseSuperReadOnly { + if _, err := ExecInstance(instanceKey, "set global super_read_only = ?", readOnly); err != nil { + // We don't bail out here. super_read_only is only available on + // MySQL 5.7.8 and Percona Server 5.6.21-70 + // At this time orchestrator does not verify whether a server supports super_read_only or not. + // It makes a best effort to set it. + log.Errore(err) + } + } + instance, err = ReadTopologyInstance(instanceKey) + + // If we just went read-only, it's safe to flip the master semi-sync switch + // OFF, which is the default value so that replicas can make progress. + if instance.SemiSyncEnforced && readOnly { + // Send ACK only from promotable instances. + sendACK := instance.PromotionRule != MustNotPromoteRule + if err := EnableSemiSync(instanceKey, false, sendACK); err != nil { + return instance, log.Errore(err) + } + } + + log.Infof("instance %+v read_only: %t", instanceKey, readOnly) + AuditOperation("read-only", instanceKey, fmt.Sprintf("set as %t", readOnly)) + + return instance, err +} + +// KillQuery kills a running query on a given instance +func KillQuery(instanceKey *InstanceKey, process int64) (*Instance, error) { + instance, err := ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + if *config.RuntimeCLIFlags.Noop { + return instance, fmt.Errorf("noop: aborting kill-query operation on %+v; signalling error but nothing went wrong.", *instanceKey) + } + + _, err = ExecInstance(instanceKey, `kill query ?`, process) + if err != nil { + return instance, log.Errore(err) + } + + instance, err = ReadTopologyInstance(instanceKey) + if err != nil { + return instance, log.Errore(err) + } + + log.Infof("Killed query on %+v", *instanceKey) + AuditOperation("kill-query", instanceKey, fmt.Sprintf("Killed query %d", process)) + return instance, err +} + +// injectPseudoGTID injects a Pseudo-GTID statement on a writable instance +func injectPseudoGTID(instance *Instance) (hint string, err error) { + if *config.RuntimeCLIFlags.Noop { + return hint, fmt.Errorf("noop: aborting inject-pseudo-gtid operation on %+v; signalling error but nothing went wrong.", instance.Key) + } + + now := time.Now() + randomHash := util.RandomHash()[0:16] + hint = fmt.Sprintf("%.8x:%.8x:%s", now.Unix(), instance.ServerID, randomHash) + query := fmt.Sprintf("drop view if exists `%s`.`_asc:%s`", config.PseudoGTIDSchema, hint) + _, err = ExecInstance(&instance.Key, query) + return hint, log.Errore(err) +} + +// canInjectPseudoGTID checks orchestrator's grants to determine whether it has the +// privilege of auto-injecting pseudo-GTID +func canInjectPseudoGTID(instanceKey *InstanceKey) (canInject bool, err error) { + if canInject, found := supportedAutoPseudoGTIDWriters.Get(instanceKey.StringCode()); found { + return canInject.(bool), nil + } + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return canInject, err + } + + foundAll := false + foundDropOnAll := false + foundAllOnSchema := false + foundDropOnSchema := false + + err = sqlutils.QueryRowsMap(db, `show grants for current_user()`, func(m sqlutils.RowMap) error { + for _, grantData := range m { + grant := grantData.String + if 
strings.Contains(grant, `GRANT ALL PRIVILEGES ON *.*`) { + foundAll = true + } + if strings.Contains(grant, `DROP`) && strings.Contains(grant, ` ON *.*`) { + foundDropOnAll = true + } + if strings.Contains(grant, fmt.Sprintf("GRANT ALL PRIVILEGES ON `%s`.*", config.PseudoGTIDSchema)) { + foundAllOnSchema = true + } + if strings.Contains(grant, fmt.Sprintf(`GRANT ALL PRIVILEGES ON "%s".*`, config.PseudoGTIDSchema)) { + foundAllOnSchema = true + } + if strings.Contains(grant, `DROP`) && strings.Contains(grant, fmt.Sprintf(" ON `%s`.*", config.PseudoGTIDSchema)) { + foundDropOnSchema = true + } + if strings.Contains(grant, `DROP`) && strings.Contains(grant, fmt.Sprintf(` ON "%s".*`, config.PseudoGTIDSchema)) { + foundDropOnSchema = true + } + } + return nil + }) + if err != nil { + return canInject, err + } + + canInject = foundAll || foundDropOnAll || foundAllOnSchema || foundDropOnSchema + supportedAutoPseudoGTIDWriters.Set(instanceKey.StringCode(), canInject, cache.DefaultExpiration) + + return canInject, nil +} + +// CheckAndInjectPseudoGTIDOnWriter checks whether pseudo-GTID can and +// should be injected on given instance, and if so, attempts to inject. +func CheckAndInjectPseudoGTIDOnWriter(instance *Instance) (injected bool, err error) { + if instance == nil { + return injected, log.Errorf("CheckAndInjectPseudoGTIDOnWriter: instance is nil") + } + if instance.ReadOnly { + return injected, log.Errorf("CheckAndInjectPseudoGTIDOnWriter: instance is read-only: %+v", instance.Key) + } + if !instance.IsLastCheckValid { + return injected, nil + } + canInject, err := canInjectPseudoGTID(&instance.Key) + if err != nil { + return injected, log.Errore(err) + } + if !canInject { + if util.ClearToLog("CheckAndInjectPseudoGTIDOnWriter", instance.Key.StringCode()) { + log.Warningf("AutoPseudoGTID enabled, but orchestrator has no privileges on %+v to inject pseudo-gtid", instance.Key) + } + + return injected, nil + } + if _, err := injectPseudoGTID(instance); err != nil { + return injected, log.Errore(err) + } + injected = true + if err := RegisterInjectedPseudoGTID(instance.ClusterName); err != nil { + return injected, log.Errore(err) + } + return injected, nil +} + +func GTIDSubtract(instanceKey *InstanceKey, gtidSet string, gtidSubset string) (gtidSubtract string, err error) { + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return gtidSubtract, err + } + err = db.QueryRow("select gtid_subtract(?, ?)", gtidSet, gtidSubset).Scan(&gtidSubtract) + return gtidSubtract, err +} + +func ShowMasterStatus(instanceKey *InstanceKey) (masterStatusFound bool, executedGtidSet string, err error) { + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return masterStatusFound, executedGtidSet, err + } + err = sqlutils.QueryRowsMap(db, "show master status", func(m sqlutils.RowMap) error { + masterStatusFound = true + executedGtidSet = m.GetStringD("Executed_Gtid_Set", "") + return nil + }) + return masterStatusFound, executedGtidSet, err +} + +func ShowBinaryLogs(instanceKey *InstanceKey) (binlogs []string, err error) { + db, err := db.OpenTopology(instanceKey.Hostname, instanceKey.Port) + if err != nil { + return binlogs, err + } + err = sqlutils.QueryRowsMap(db, "show binary logs", func(m sqlutils.RowMap) error { + binlogs = append(binlogs, m.GetString("Log_name")) + return nil + }) + return binlogs, err +} diff --git a/go/vt/orchestrator/inst/instance_topology_test.go b/go/vt/orchestrator/inst/instance_topology_test.go new file mode 100644 
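// ---------------------------------------------------------------------------
// Editor's illustrative sketch (not part of this change): GTIDSubtract and
// ShowMasterStatus above simply delegate the work to the MySQL server, so the
// same calls can be reproduced with database/sql and the go-sql-driver/mysql
// driver already present in go.mod. The DSN and the GTID subset below are
// made-up placeholders, not values used by orchestrator.
// ---------------------------------------------------------------------------
package main

import (
	"database/sql"
	"fmt"

	_ "github.com/go-sql-driver/mysql"
)

func main() {
	// Assumed DSN; replace with a reachable MySQL endpoint.
	db, err := sql.Open("mysql", "user:password@tcp(127.0.0.1:3306)/")
	if err != nil {
		panic(err)
	}
	defer db.Close()

	// Same idea as ShowMasterStatus(): read Executed_Gtid_Set from the server.
	var file, doDB, ignoreDB, executedGtidSet string
	var pos int64
	if err := db.QueryRow("show master status").Scan(&file, &pos, &doDB, &ignoreDB, &executedGtidSet); err != nil {
		panic(err)
	}

	// Same idea as GTIDSubtract(): let the server compute the set difference.
	subset := "00020194-3333-3333-3333-333333333333:1-7" // hypothetical GTID entry
	var remainder string
	if err := db.QueryRow("select gtid_subtract(?, ?)", executedGtidSet, subset).Scan(&remainder); err != nil {
		panic(err)
	}
	fmt.Println("left after subtract:", remainder)
}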
index 0000000000..ab2f2f0455 --- /dev/null +++ b/go/vt/orchestrator/inst/instance_topology_test.go @@ -0,0 +1,571 @@ +package inst + +import ( + "math/rand" + + "testing" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +var ( + i710Key = InstanceKey{Hostname: "i710", Port: 3306} + i720Key = InstanceKey{Hostname: "i720", Port: 3306} + i730Key = InstanceKey{Hostname: "i730", Port: 3306} + i810Key = InstanceKey{Hostname: "i810", Port: 3306} + i820Key = InstanceKey{Hostname: "i820", Port: 3306} + i830Key = InstanceKey{Hostname: "i830", Port: 3306} +) + +func init() { + config.Config.HostnameResolveMethod = "none" + config.MarkConfigurationLoaded() + log.SetLevel(log.ERROR) +} + +func generateTestInstances() (instances [](*Instance), instancesMap map[string](*Instance)) { + i710 := Instance{Key: i710Key, ServerID: 710, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 10}} + i720 := Instance{Key: i720Key, ServerID: 720, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 20}} + i730 := Instance{Key: i730Key, ServerID: 730, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000007", LogPos: 30}} + i810 := Instance{Key: i810Key, ServerID: 810, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000008", LogPos: 10}} + i820 := Instance{Key: i820Key, ServerID: 820, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000008", LogPos: 20}} + i830 := Instance{Key: i830Key, ServerID: 830, ExecBinlogCoordinates: BinlogCoordinates{LogFile: "mysql.000008", LogPos: 30}} + instances = [](*Instance){&i710, &i720, &i730, &i810, &i820, &i830} + for _, instance := range instances { + instance.Version = "5.6.7" + instance.Binlog_format = "STATEMENT" + } + instancesMap = make(map[string](*Instance)) + for _, instance := range instances { + instancesMap[instance.Key.StringCode()] = instance + } + return instances, instancesMap +} + +func applyGeneralGoodToGoReplicationParams(instances [](*Instance)) { + for _, instance := range instances { + instance.IsLastCheckValid = true + instance.LogBinEnabled = true + instance.LogReplicationUpdatesEnabled = true + } +} + +func TestInitial(t *testing.T) { + test.S(t).ExpectTrue(true) +} + +func TestSortInstances(t *testing.T) { + instances, _ := generateTestInstances() + sortInstances(instances) + test.S(t).ExpectEquals(instances[0].Key, i830Key) + test.S(t).ExpectEquals(instances[1].Key, i820Key) + test.S(t).ExpectEquals(instances[2].Key, i810Key) + test.S(t).ExpectEquals(instances[3].Key, i730Key) + test.S(t).ExpectEquals(instances[4].Key, i720Key) + test.S(t).ExpectEquals(instances[5].Key, i710Key) +} + +func TestSortInstancesSameCoordinatesDifferingBinlogFormats(t *testing.T) { + instances, instancesMap := generateTestInstances() + for _, instance := range instances { + instance.ExecBinlogCoordinates = instances[0].ExecBinlogCoordinates + instance.Binlog_format = "MIXED" + } + instancesMap[i810Key.StringCode()].Binlog_format = "STATEMENT" + instancesMap[i720Key.StringCode()].Binlog_format = "ROW" + sortInstances(instances) + test.S(t).ExpectEquals(instances[0].Key, i810Key) + test.S(t).ExpectEquals(instances[5].Key, i720Key) +} + +func TestSortInstancesSameCoordinatesDifferingVersions(t *testing.T) { + instances, instancesMap := generateTestInstances() + for _, instance := range instances { + instance.ExecBinlogCoordinates = instances[0].ExecBinlogCoordinates + } + 
instancesMap[i810Key.StringCode()].Version = "5.5.1" + instancesMap[i720Key.StringCode()].Version = "5.7.8" + sortInstances(instances) + test.S(t).ExpectEquals(instances[0].Key, i810Key) + test.S(t).ExpectEquals(instances[5].Key, i720Key) +} + +func TestSortInstancesDataCenterHint(t *testing.T) { + instances, instancesMap := generateTestInstances() + for _, instance := range instances { + instance.ExecBinlogCoordinates = instances[0].ExecBinlogCoordinates + instance.DataCenter = "somedc" + } + instancesMap[i810Key.StringCode()].DataCenter = "localdc" + sortInstancesDataCenterHint(instances, "localdc") + test.S(t).ExpectEquals(instances[0].Key, i810Key) +} + +func TestSortInstancesGtidErrant(t *testing.T) { + instances, instancesMap := generateTestInstances() + for _, instance := range instances { + instance.ExecBinlogCoordinates = instances[0].ExecBinlogCoordinates + instance.GtidErrant = "00020192-1111-1111-1111-111111111111:1" + } + instancesMap[i810Key.StringCode()].GtidErrant = "" + sortInstances(instances) + test.S(t).ExpectEquals(instances[0].Key, i810Key) +} + +func TestGetPriorityMajorVersionForCandidate(t *testing.T) { + { + instances, instancesMap := generateTestInstances() + + priorityMajorVersion, err := getPriorityMajorVersionForCandidate(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(priorityMajorVersion, "5.6") + + instancesMap[i810Key.StringCode()].Version = "5.5.1" + instancesMap[i720Key.StringCode()].Version = "5.7.8" + priorityMajorVersion, err = getPriorityMajorVersionForCandidate(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(priorityMajorVersion, "5.6") + + instancesMap[i710Key.StringCode()].Version = "5.7.8" + instancesMap[i720Key.StringCode()].Version = "5.7.8" + instancesMap[i730Key.StringCode()].Version = "5.7.8" + instancesMap[i830Key.StringCode()].Version = "5.7.8" + priorityMajorVersion, err = getPriorityMajorVersionForCandidate(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(priorityMajorVersion, "5.7") + } + { + instances, instancesMap := generateTestInstances() + + instancesMap[i710Key.StringCode()].Version = "5.6.9" + instancesMap[i720Key.StringCode()].Version = "5.6.9" + instancesMap[i730Key.StringCode()].Version = "5.7.8" + instancesMap[i810Key.StringCode()].Version = "5.7.8" + instancesMap[i820Key.StringCode()].Version = "5.7.8" + instancesMap[i830Key.StringCode()].Version = "5.6.9" + priorityMajorVersion, err := getPriorityMajorVersionForCandidate(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(priorityMajorVersion, "5.6") + } + // We will be testing under conditions that map iteration is in random order. + for range rand.Perm(20) { // Just running many iterations to cover multiple possible map iteration ordering. Perm() is just used as an array generator here. 
+ instances, _ := generateTestInstances() + for _, instance := range instances { + instance.Version = "5.6.9" + } + test.S(t).ExpectEquals(len(instances), 6) + // Randomly populating different elements of the array/map + perm := rand.Perm(len(instances))[0 : len(instances)/2] + for _, i := range perm { + instances[i].Version = "5.7.8" + } + // getPriorityMajorVersionForCandidate uses map iteration + priorityMajorVersion, err := getPriorityMajorVersionForCandidate(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(priorityMajorVersion, "5.6") + } +} + +func TestGetPriorityBinlogFormatForCandidate(t *testing.T) { + { + instances, instancesMap := generateTestInstances() + + priorityBinlogFormat, err := getPriorityBinlogFormatForCandidate(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(priorityBinlogFormat, "STATEMENT") + + instancesMap[i810Key.StringCode()].Binlog_format = "MIXED" + instancesMap[i720Key.StringCode()].Binlog_format = "ROW" + priorityBinlogFormat, err = getPriorityBinlogFormatForCandidate(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(priorityBinlogFormat, "STATEMENT") + + instancesMap[i710Key.StringCode()].Binlog_format = "ROW" + instancesMap[i720Key.StringCode()].Binlog_format = "ROW" + instancesMap[i730Key.StringCode()].Binlog_format = "ROW" + instancesMap[i830Key.StringCode()].Binlog_format = "ROW" + priorityBinlogFormat, err = getPriorityBinlogFormatForCandidate(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(priorityBinlogFormat, "ROW") + } + for _, lowBinlogFormat := range []string{"STATEMENT", "MIXED"} { + // We will be testing under conditions that map iteration is in random order. + for range rand.Perm(20) { // Just running many iterations to cover multiple possible map iteration ordering. Perm() is just used as an array generator here. 
+ instances, _ := generateTestInstances() + for _, instance := range instances { + instance.Binlog_format = lowBinlogFormat + } + test.S(t).ExpectEquals(len(instances), 6) + // Randomly populating different elements of the array/map + perm := rand.Perm(len(instances))[0 : len(instances)/2] + for _, i := range perm { + instances[i].Binlog_format = "ROW" + } + // getPriorityBinlogFormatForCandidate uses map iteration + priorityBinlogFormat, err := getPriorityBinlogFormatForCandidate(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(priorityBinlogFormat, lowBinlogFormat) + } + } +} + +func TestIsGenerallyValidAsBinlogSource(t *testing.T) { + instances, _ := generateTestInstances() + for _, instance := range instances { + test.S(t).ExpectFalse(isGenerallyValidAsBinlogSource(instance)) + } + applyGeneralGoodToGoReplicationParams(instances) + for _, instance := range instances { + test.S(t).ExpectTrue(isGenerallyValidAsBinlogSource(instance)) + } +} + +func TestIsGenerallyValidAsCandidateReplica(t *testing.T) { + instances, _ := generateTestInstances() + for _, instance := range instances { + test.S(t).ExpectFalse(isGenerallyValidAsCandidateReplica(instance)) + } + for _, instance := range instances { + instance.IsLastCheckValid = true + instance.LogBinEnabled = true + instance.LogReplicationUpdatesEnabled = false + } + for _, instance := range instances { + test.S(t).ExpectFalse(isGenerallyValidAsCandidateReplica(instance)) + } + applyGeneralGoodToGoReplicationParams(instances) + for _, instance := range instances { + test.S(t).ExpectTrue(isGenerallyValidAsCandidateReplica(instance)) + } +} + +func TestIsBannedFromBeingCandidateReplica(t *testing.T) { + { + instances, _ := generateTestInstances() + for _, instance := range instances { + test.S(t).ExpectFalse(IsBannedFromBeingCandidateReplica(instance)) + } + } + { + instances, _ := generateTestInstances() + for _, instance := range instances { + instance.PromotionRule = MustNotPromoteRule + } + for _, instance := range instances { + test.S(t).ExpectTrue(IsBannedFromBeingCandidateReplica(instance)) + } + } + { + instances, _ := generateTestInstances() + config.Config.PromotionIgnoreHostnameFilters = []string{ + "i7", + "i8[0-9]0", + } + for _, instance := range instances { + test.S(t).ExpectTrue(IsBannedFromBeingCandidateReplica(instance)) + } + config.Config.PromotionIgnoreHostnameFilters = []string{} + } +} + +func TestChooseCandidateReplicaNoCandidateReplica(t *testing.T) { + instances, _ := generateTestInstances() + for _, instance := range instances { + instance.IsLastCheckValid = true + instance.LogBinEnabled = true + instance.LogReplicationUpdatesEnabled = false + } + _, _, _, _, _, err := chooseCandidateReplica(instances) + test.S(t).ExpectNotNil(err) +} + +func TestChooseCandidateReplica(t *testing.T) { + instances, _ := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i830Key) + test.S(t).ExpectEquals(len(aheadReplicas), 0) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 5) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplica2(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + 
instancesMap[i830Key.StringCode()].LogReplicationUpdatesEnabled = false + instancesMap[i820Key.StringCode()].LogBinEnabled = false + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i810Key) + test.S(t).ExpectEquals(len(aheadReplicas), 2) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 3) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaSameCoordinatesDifferentVersions(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + for _, instance := range instances { + instance.ExecBinlogCoordinates = instances[0].ExecBinlogCoordinates + } + instancesMap[i810Key.StringCode()].Version = "5.5.1" + instancesMap[i720Key.StringCode()].Version = "5.7.8" + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i810Key) + test.S(t).ExpectEquals(len(aheadReplicas), 0) + test.S(t).ExpectEquals(len(equalReplicas), 5) + test.S(t).ExpectEquals(len(laterReplicas), 0) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPriorityVersionNoLoss(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + instancesMap[i830Key.StringCode()].Version = "5.5.1" + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i830Key) + test.S(t).ExpectEquals(len(aheadReplicas), 0) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 5) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPriorityVersionLosesOne(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + instancesMap[i830Key.StringCode()].Version = "5.7.8" + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i820Key) + test.S(t).ExpectEquals(len(aheadReplicas), 1) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 4) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPriorityVersionLosesTwo(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + instancesMap[i830Key.StringCode()].Version = "5.7.8" + instancesMap[i820Key.StringCode()].Version = "5.7.18" + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i810Key) + test.S(t).ExpectEquals(len(aheadReplicas), 2) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 3) + 
test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPriorityVersionHigherVersionOverrides(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + instancesMap[i830Key.StringCode()].Version = "5.7.8" + instancesMap[i820Key.StringCode()].Version = "5.7.18" + instancesMap[i810Key.StringCode()].Version = "5.7.5" + instancesMap[i730Key.StringCode()].Version = "5.7.30" + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i830Key) + test.S(t).ExpectEquals(len(aheadReplicas), 0) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 3) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 2) +} + +func TestChooseCandidateReplicaLosesOneDueToBinlogFormat(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + for _, instance := range instances { + instance.Binlog_format = "ROW" + } + instancesMap[i730Key.StringCode()].Binlog_format = "STATEMENT" + + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i830Key) + test.S(t).ExpectEquals(len(aheadReplicas), 0) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 4) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 1) +} + +func TestChooseCandidateReplicaPriorityBinlogFormatNoLoss(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + for _, instance := range instances { + instance.Binlog_format = "MIXED" + } + instancesMap[i830Key.StringCode()].Binlog_format = "STATEMENT" + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i830Key) + test.S(t).ExpectEquals(len(aheadReplicas), 0) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 5) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPriorityBinlogFormatLosesOne(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + instancesMap[i830Key.StringCode()].Binlog_format = "ROW" + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i820Key) + test.S(t).ExpectEquals(len(aheadReplicas), 1) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 4) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPriorityBinlogFormatLosesTwo(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + instancesMap[i830Key.StringCode()].Binlog_format = "ROW" + instancesMap[i820Key.StringCode()].Binlog_format = "ROW" + instances = 
sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i810Key) + test.S(t).ExpectEquals(len(aheadReplicas), 2) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 3) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPriorityBinlogFormatRowOverrides(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + instancesMap[i830Key.StringCode()].Binlog_format = "ROW" + instancesMap[i820Key.StringCode()].Binlog_format = "ROW" + instancesMap[i810Key.StringCode()].Binlog_format = "ROW" + instancesMap[i730Key.StringCode()].Binlog_format = "ROW" + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i830Key) + test.S(t).ExpectEquals(len(aheadReplicas), 0) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 3) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 2) +} + +func TestChooseCandidateReplicaMustNotPromoteRule(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + instancesMap[i830Key.StringCode()].PromotionRule = MustNotPromoteRule + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i820Key) + test.S(t).ExpectEquals(len(aheadReplicas), 1) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 4) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPreferNotPromoteRule(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + instancesMap[i830Key.StringCode()].PromotionRule = MustNotPromoteRule + instancesMap[i820Key.StringCode()].PromotionRule = PreferNotPromoteRule + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i820Key) + test.S(t).ExpectEquals(len(aheadReplicas), 1) + test.S(t).ExpectEquals(len(equalReplicas), 0) + test.S(t).ExpectEquals(len(laterReplicas), 4) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPreferNotPromoteRule2(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + for _, instance := range instances { + instance.PromotionRule = PreferNotPromoteRule + } + instancesMap[i830Key.StringCode()].PromotionRule = MustNotPromoteRule + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i820Key) + test.S(t).ExpectEquals(len(aheadReplicas), 1) + test.S(t).ExpectEquals(len(equalReplicas), 0) + 
test.S(t).ExpectEquals(len(laterReplicas), 4) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPromoteRuleOrdering(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + for _, instance := range instances { + instance.ExecBinlogCoordinates = instancesMap[i710Key.StringCode()].ExecBinlogCoordinates + instance.PromotionRule = NeutralPromoteRule + } + instancesMap[i830Key.StringCode()].PromotionRule = PreferPromoteRule + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i830Key) + test.S(t).ExpectEquals(len(aheadReplicas), 0) + test.S(t).ExpectEquals(len(equalReplicas), 5) + test.S(t).ExpectEquals(len(laterReplicas), 0) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPromoteRuleOrdering2(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + for _, instance := range instances { + instance.ExecBinlogCoordinates = instancesMap[i710Key.StringCode()].ExecBinlogCoordinates + instance.PromotionRule = PreferPromoteRule + } + instancesMap[i820Key.StringCode()].PromotionRule = MustPromoteRule + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i820Key) + test.S(t).ExpectEquals(len(aheadReplicas), 0) + test.S(t).ExpectEquals(len(equalReplicas), 5) + test.S(t).ExpectEquals(len(laterReplicas), 0) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} + +func TestChooseCandidateReplicaPromoteRuleOrdering3(t *testing.T) { + instances, instancesMap := generateTestInstances() + applyGeneralGoodToGoReplicationParams(instances) + for _, instance := range instances { + instance.ExecBinlogCoordinates = instancesMap[i710Key.StringCode()].ExecBinlogCoordinates + instance.PromotionRule = NeutralPromoteRule + } + instancesMap[i730Key.StringCode()].PromotionRule = MustPromoteRule + instancesMap[i810Key.StringCode()].PromotionRule = PreferPromoteRule + instancesMap[i830Key.StringCode()].PromotionRule = PreferNotPromoteRule + instances = sortedReplicas(instances, NoStopReplication) + candidate, aheadReplicas, equalReplicas, laterReplicas, cannotReplicateReplicas, err := chooseCandidateReplica(instances) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(candidate.Key, i730Key) + test.S(t).ExpectEquals(len(aheadReplicas), 0) + test.S(t).ExpectEquals(len(equalReplicas), 5) + test.S(t).ExpectEquals(len(laterReplicas), 0) + test.S(t).ExpectEquals(len(cannotReplicateReplicas), 0) +} diff --git a/go/vt/orchestrator/inst/instance_utils.go b/go/vt/orchestrator/inst/instance_utils.go new file mode 100644 index 0000000000..0512775b86 --- /dev/null +++ b/go/vt/orchestrator/inst/instance_utils.go @@ -0,0 +1,260 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "regexp" + "strconv" + "strings" +) + +var ( + DowntimeLostInRecoveryMessage = "lost-in-recovery" +) + +// majorVersionsSortedByCount sorts (major) versions: +// - primary sort: by count appearances +// - secondary sort: by version +type majorVersionsSortedByCount struct { + versionsCount map[string]int + versions []string +} + +func NewMajorVersionsSortedByCount(versionsCount map[string]int) *majorVersionsSortedByCount { + versions := []string{} + for v := range versionsCount { + versions = append(versions, v) + } + return &majorVersionsSortedByCount{ + versionsCount: versionsCount, + versions: versions, + } +} + +func (this *majorVersionsSortedByCount) Len() int { return len(this.versions) } +func (this *majorVersionsSortedByCount) Swap(i, j int) { + this.versions[i], this.versions[j] = this.versions[j], this.versions[i] +} +func (this *majorVersionsSortedByCount) Less(i, j int) bool { + if this.versionsCount[this.versions[i]] == this.versionsCount[this.versions[j]] { + return this.versions[i] > this.versions[j] + } + return this.versionsCount[this.versions[i]] < this.versionsCount[this.versions[j]] +} +func (this *majorVersionsSortedByCount) First() string { + return this.versions[0] +} + +// majorVersionsSortedByCount sorts (major) versions: +// - primary sort: by count appearances +// - secondary sort: by version +type binlogFormatSortedByCount struct { + formatsCount map[string]int + formats []string +} + +func NewBinlogFormatSortedByCount(formatsCount map[string]int) *binlogFormatSortedByCount { + formats := []string{} + for v := range formatsCount { + formats = append(formats, v) + } + return &binlogFormatSortedByCount{ + formatsCount: formatsCount, + formats: formats, + } +} + +func (this *binlogFormatSortedByCount) Len() int { return len(this.formats) } +func (this *binlogFormatSortedByCount) Swap(i, j int) { + this.formats[i], this.formats[j] = this.formats[j], this.formats[i] +} +func (this *binlogFormatSortedByCount) Less(i, j int) bool { + if this.formatsCount[this.formats[i]] == this.formatsCount[this.formats[j]] { + return IsSmallerBinlogFormat(this.formats[j], this.formats[i]) + } + return this.formatsCount[this.formats[i]] < this.formatsCount[this.formats[j]] +} +func (this *binlogFormatSortedByCount) First() string { + return this.formats[0] +} + +// InstancesSorterByExec sorts instances by executed binlog coordinates +type InstancesSorterByExec struct { + instances [](*Instance) + dataCenter string +} + +func NewInstancesSorterByExec(instances [](*Instance), dataCenter string) *InstancesSorterByExec { + return &InstancesSorterByExec{ + instances: instances, + dataCenter: dataCenter, + } +} + +func (this *InstancesSorterByExec) Len() int { return len(this.instances) } +func (this *InstancesSorterByExec) Swap(i, j int) { + this.instances[i], this.instances[j] = this.instances[j], this.instances[i] +} +func (this *InstancesSorterByExec) Less(i, j int) bool { + // Returning "true" in this function means [i] is "smaller" than [j], + // which will lead to [j] be a better candidate for promotion + + // Sh*t happens. 
We just might get nil while attempting to discover/recover + if this.instances[i] == nil { + return false + } + if this.instances[j] == nil { + return true + } + if this.instances[i].ExecBinlogCoordinates.Equals(&this.instances[j].ExecBinlogCoordinates) { + // Secondary sorting: "smaller" if not logging replica updates + if this.instances[j].LogReplicationUpdatesEnabled && !this.instances[i].LogReplicationUpdatesEnabled { + return true + } + // Next sorting: "smaller" if of higher version (this will be reversed eventually) + // Idea is that given 5.6 and 5.7 both of the exact position, we will want to promote + // the 5.6 on top of 5.7, as the other way around is invalid + if this.instances[j].IsSmallerMajorVersion(this.instances[i]) { + return true + } + // Next sorting: "smaller" if of larger binlog-format (this will be reversed eventually) + // Idea is that given ROW & STATEMENT both of the exact position, we will want to promote + // the STATEMENT on top of ROW, as the other way around is invalid + if this.instances[j].IsSmallerBinlogFormat(this.instances[i]) { + return true + } + // Prefer local datacenter: + if this.instances[j].DataCenter == this.dataCenter && this.instances[i].DataCenter != this.dataCenter { + return true + } + // Prefer if not having errant GTID + if this.instances[j].GtidErrant == "" && this.instances[i].GtidErrant != "" { + return true + } + // Prefer candidates: + if this.instances[j].PromotionRule.BetterThan(this.instances[i].PromotionRule) { + return true + } + } + return this.instances[i].ExecBinlogCoordinates.SmallerThan(&this.instances[j].ExecBinlogCoordinates) +} + +// filterInstancesByPattern will filter given array of instances according to regular expression pattern +func filterInstancesByPattern(instances [](*Instance), pattern string) [](*Instance) { + if pattern == "" { + return instances + } + filtered := [](*Instance){} + for _, instance := range instances { + if matched, _ := regexp.MatchString(pattern, instance.Key.DisplayString()); matched { + filtered = append(filtered, instance) + } + } + return filtered +} + +// RemoveInstance will remove an instance from a list of instances +func RemoveInstance(instances [](*Instance), instanceKey *InstanceKey) [](*Instance) { + if instanceKey == nil { + return instances + } + for i := len(instances) - 1; i >= 0; i-- { + if instances[i].Key.Equals(instanceKey) { + instances = append(instances[:i], instances[i+1:]...) + } + } + return instances +} + +// RemoveBinlogServerInstances will remove all binlog servers from the given list +func RemoveBinlogServerInstances(instances [](*Instance)) [](*Instance) { + for i := len(instances) - 1; i >= 0; i-- { + if instances[i].IsBinlogServer() { + instances = append(instances[:i], instances[i+1:]...) + } + } + return instances +} + +// RemoveNilInstances removes nil entries from the given list of instances +func RemoveNilInstances(instances [](*Instance)) [](*Instance) { + for i := len(instances) - 1; i >= 0; i-- { + if instances[i] == nil { + instances = append(instances[:i], instances[i+1:]...) + } + } + return instances +} + +// SemicolonTerminated is a utility function that makes sure a statement is terminated with +// a semicolon, if it isn't already +func SemicolonTerminated(statement string) string { + statement = strings.TrimSpace(statement) + statement = strings.TrimRight(statement, ";") + statement = statement + ";" + return statement +} + +// MajorVersion returns a MySQL major version number (e.g. 
given "5.5.36" it returns "5.5") +func MajorVersion(version string) []string { + tokens := strings.Split(version, ".") + if len(tokens) < 2 { + return []string{"0", "0"} + } + return tokens[:2] +} + +// IsSmallerMajorVersion tests two versions against one another and returns true if +// the former is a smaller "major" version than the latter. +// e.g. 5.5.36 is NOT a smaller major version as compared to 5.5.40, but IS as compared to 5.6.9 +func IsSmallerMajorVersion(version string, otherVersion string) bool { + thisMajorVersion := MajorVersion(version) + otherMajorVersion := MajorVersion(otherVersion) + for i := 0; i < len(thisMajorVersion); i++ { + thisToken, _ := strconv.Atoi(thisMajorVersion[i]) + otherToken, _ := strconv.Atoi(otherMajorVersion[i]) + if thisToken < otherToken { + return true + } + if thisToken > otherToken { + return false + } + } + return false +} + +// IsSmallerBinlogFormat tests two binlog formats and sees if one is "smaller" than the other. +// "smaller" binlog format means you can replicate from the smaller to the larger. +func IsSmallerBinlogFormat(binlogFormat string, otherBinlogFormat string) bool { + if binlogFormat == "STATEMENT" { + return (otherBinlogFormat == "ROW" || otherBinlogFormat == "MIXED") + } + if binlogFormat == "MIXED" { + return otherBinlogFormat == "ROW" + } + return false +} + +// RegexpMatchPatterns returns true if s matches any of the provided regexpPatterns +func RegexpMatchPatterns(s string, regexpPatterns []string) bool { + for _, filter := range regexpPatterns { + if matched, err := regexp.MatchString(filter, s); err == nil && matched { + return true + } + } + return false +} diff --git a/go/vt/orchestrator/inst/instance_utils_test.go b/go/vt/orchestrator/inst/instance_utils_test.go new file mode 100644 index 0000000000..f6247d5d6d --- /dev/null +++ b/go/vt/orchestrator/inst/instance_utils_test.go @@ -0,0 +1,30 @@ +package inst + +import ( + "testing" +) + +type testPatterns struct { + s string + patterns []string + expected bool +} + +func TestRegexpMatchPatterns(t *testing.T) { + patterns := []testPatterns{ + {"hostname", []string{}, false}, + {"hostname", []string{"blah"}, false}, + {"hostname", []string{"blah", "blah"}, false}, + {"hostname", []string{"host", "blah"}, true}, + {"hostname", []string{"blah", "host"}, true}, + {"hostname", []string{"ho.tname"}, true}, + {"hostname", []string{"ho.tname2"}, false}, + {"hostname", []string{"ho.*me"}, true}, + } + + for _, p := range patterns { + if match := RegexpMatchPatterns(p.s, p.patterns); match != p.expected { + t.Errorf("RegexpMatchPatterns failed with: %q, %+v, got: %+v, expected: %+v", p.s, p.patterns, match, p.expected) + } + } +} diff --git a/go/vt/orchestrator/inst/maintenance.go b/go/vt/orchestrator/inst/maintenance.go new file mode 100644 index 0000000000..3df916a90c --- /dev/null +++ b/go/vt/orchestrator/inst/maintenance.go @@ -0,0 +1,45 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "vitess.io/vitess/go/vt/orchestrator/config" +) + +// Maintenance indicates a maintenance entry (also in the database) +type Maintenance struct { + MaintenanceId uint + Key InstanceKey + BeginTimestamp string + SecondsElapsed uint + IsActive bool + Owner string + Reason string +} + +var maintenanceOwner string = "" + +func GetMaintenanceOwner() string { + if maintenanceOwner != "" { + return maintenanceOwner + } + return config.MaintenanceOwner +} + +func SetMaintenanceOwner(owner string) { + maintenanceOwner = owner +} diff --git a/go/vt/orchestrator/inst/maintenance_dao.go b/go/vt/orchestrator/inst/maintenance_dao.go new file mode 100644 index 0000000000..2aa9f69cd5 --- /dev/null +++ b/go/vt/orchestrator/inst/maintenance_dao.go @@ -0,0 +1,271 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + "vitess.io/vitess/go/vt/orchestrator/process" + "vitess.io/vitess/go/vt/orchestrator/util" +) + +// ReadActiveMaintenance returns the list of currently active maintenance entries +func ReadActiveMaintenance() ([]Maintenance, error) { + res := []Maintenance{} + query := ` + select + database_instance_maintenance_id, + hostname, + port, + begin_timestamp, + unix_timestamp() - unix_timestamp(begin_timestamp) as seconds_elapsed, + maintenance_active, + owner, + reason + from + database_instance_maintenance + where + maintenance_active = 1 + order by + database_instance_maintenance_id + ` + err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + maintenance := Maintenance{} + maintenance.MaintenanceId = m.GetUint("database_instance_maintenance_id") + maintenance.Key.Hostname = m.GetString("hostname") + maintenance.Key.Port = m.GetInt("port") + maintenance.BeginTimestamp = m.GetString("begin_timestamp") + maintenance.SecondsElapsed = m.GetUint("seconds_elapsed") + maintenance.IsActive = m.GetBool("maintenance_active") + maintenance.Owner = m.GetString("owner") + maintenance.Reason = m.GetString("reason") + + res = append(res, maintenance) + return nil + }) + + if err != nil { + log.Errore(err) + } + return res, err + +} + +// BeginBoundedMaintenance will make new maintenance entry for given instanceKey. +func BeginBoundedMaintenance(instanceKey *InstanceKey, owner string, reason string, durationSeconds uint, explicitlyBounded bool) (int64, error) { + var maintenanceToken int64 = 0 + if durationSeconds == 0 { + durationSeconds = config.MaintenanceExpireMinutes * 60 + } + res, err := db.ExecOrchestrator(` + insert ignore + into database_instance_maintenance ( + hostname, port, maintenance_active, begin_timestamp, end_timestamp, owner, reason, + processing_node_hostname, processing_node_token, explicitly_bounded + ) VALUES ( + ?, ?, 1, NOW(), NOW() + INTERVAL ? SECOND, ?, ?, + ?, ?, ? 
+ ) + `, + instanceKey.Hostname, + instanceKey.Port, + durationSeconds, + owner, + reason, + process.ThisHostname, + util.ProcessToken.Hash, + explicitlyBounded, + ) + if err != nil { + return maintenanceToken, log.Errore(err) + } + + if affected, _ := res.RowsAffected(); affected == 0 { + err = fmt.Errorf("Cannot begin maintenance for instance: %+v; maintenance reason: %+v", instanceKey, reason) + } else { + // success + maintenanceToken, _ = res.LastInsertId() + AuditOperation("begin-maintenance", instanceKey, fmt.Sprintf("maintenanceToken: %d, owner: %s, reason: %s", maintenanceToken, owner, reason)) + } + return maintenanceToken, err +} + +// BeginMaintenance will make new maintenance entry for given instanceKey. Maintenance time is unbounded +func BeginMaintenance(instanceKey *InstanceKey, owner string, reason string) (int64, error) { + return BeginBoundedMaintenance(instanceKey, owner, reason, 0, false) +} + +// EndMaintenanceByInstanceKey will terminate an active maintenance using given instanceKey as hint +func EndMaintenanceByInstanceKey(instanceKey *InstanceKey) (wasMaintenance bool, err error) { + res, err := db.ExecOrchestrator(` + update + database_instance_maintenance + set + maintenance_active = NULL, + end_timestamp = NOW() + where + hostname = ? + and port = ? + and maintenance_active = 1 + `, + instanceKey.Hostname, + instanceKey.Port, + ) + if err != nil { + return wasMaintenance, log.Errore(err) + } + + if affected, _ := res.RowsAffected(); affected > 0 { + // success + wasMaintenance = true + AuditOperation("end-maintenance", instanceKey, "") + } + return wasMaintenance, err +} + +// InMaintenance checks whether a given instance is under maintenacne +func InMaintenance(instanceKey *InstanceKey) (inMaintenance bool, err error) { + query := ` + select + count(*) > 0 as in_maintenance + from + database_instance_maintenance + where + hostname = ? + and port = ? + and maintenance_active = 1 + and end_timestamp > NOW() + ` + args := sqlutils.Args(instanceKey.Hostname, instanceKey.Port) + err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + inMaintenance = m.GetBool("in_maintenance") + return nil + }) + + return inMaintenance, log.Errore(err) +} + +// ReadMaintenanceInstanceKey will return the instanceKey for active maintenance by maintenanceToken +func ReadMaintenanceInstanceKey(maintenanceToken int64) (*InstanceKey, error) { + var res *InstanceKey + query := ` + select + hostname, port + from + database_instance_maintenance + where + database_instance_maintenance_id = ? + ` + + err := db.QueryOrchestrator(query, sqlutils.Args(maintenanceToken), func(m sqlutils.RowMap) error { + instanceKey, merr := NewResolveInstanceKey(m.GetString("hostname"), m.GetInt("port")) + if merr != nil { + return merr + } + + res = instanceKey + return nil + }) + + return res, log.Errore(err) +} + +// EndMaintenance will terminate an active maintenance via maintenanceToken +func EndMaintenance(maintenanceToken int64) (wasMaintenance bool, err error) { + res, err := db.ExecOrchestrator(` + update + database_instance_maintenance + set + maintenance_active = NULL, + end_timestamp = NOW() + where + database_instance_maintenance_id = ? 
+ `, + maintenanceToken, + ) + if err != nil { + return wasMaintenance, log.Errore(err) + } + if affected, _ := res.RowsAffected(); affected > 0 { + // success + wasMaintenance = true + instanceKey, _ := ReadMaintenanceInstanceKey(maintenanceToken) + AuditOperation("end-maintenance", instanceKey, fmt.Sprintf("maintenanceToken: %d", maintenanceToken)) + } + return wasMaintenance, err +} + +// ExpireMaintenance will remove the maintenance flag on old maintenances and on bounded maintenances +func ExpireMaintenance() error { + { + res, err := db.ExecOrchestrator(` + delete from + database_instance_maintenance + where + maintenance_active is null + and end_timestamp < NOW() - INTERVAL ? DAY + `, + config.MaintenancePurgeDays, + ) + if err != nil { + return log.Errore(err) + } + if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { + AuditOperation("expire-maintenance", nil, fmt.Sprintf("Purged historical entries: %d", rowsAffected)) + } + } + { + res, err := db.ExecOrchestrator(` + delete from + database_instance_maintenance + where + maintenance_active = 1 + and end_timestamp < NOW() + `, + ) + if err != nil { + return log.Errore(err) + } + if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { + AuditOperation("expire-maintenance", nil, fmt.Sprintf("Expired bounded: %d", rowsAffected)) + } + } + { + res, err := db.ExecOrchestrator(` + delete from + database_instance_maintenance + where + explicitly_bounded = 0 + and concat(processing_node_hostname, ':', processing_node_token) not in ( + select concat(hostname, ':', token) from node_health + ) + `, + ) + if err != nil { + return log.Errore(err) + } + if rowsAffected, _ := res.RowsAffected(); rowsAffected > 0 { + AuditOperation("expire-maintenance", nil, fmt.Sprintf("Expired dead: %d", rowsAffected)) + } + } + + return nil +} diff --git a/go/vt/orchestrator/inst/master_equivalence.go b/go/vt/orchestrator/inst/master_equivalence.go new file mode 100644 index 0000000000..41f50e848e --- /dev/null +++ b/go/vt/orchestrator/inst/master_equivalence.go @@ -0,0 +1,23 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +// InstanceBinlogCoordinates is a convenice wrapper for instance key + binlog coordinates +type InstanceBinlogCoordinates struct { + Key InstanceKey + Coordinates BinlogCoordinates +} diff --git a/go/vt/orchestrator/inst/master_equivalence_dao.go b/go/vt/orchestrator/inst/master_equivalence_dao.go new file mode 100644 index 0000000000..275fd18300 --- /dev/null +++ b/go/vt/orchestrator/inst/master_equivalence_dao.go @@ -0,0 +1,130 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +func WriteMasterPositionEquivalence(master1Key *InstanceKey, master1BinlogCoordinates *BinlogCoordinates, + master2Key *InstanceKey, master2BinlogCoordinates *BinlogCoordinates) error { + if master1Key.Equals(master2Key) { + // Not interesting + return nil + } + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + insert into master_position_equivalence ( + master1_hostname, master1_port, master1_binary_log_file, master1_binary_log_pos, + master2_hostname, master2_port, master2_binary_log_file, master2_binary_log_pos, + last_suggested) + values (?, ?, ?, ?, ?, ?, ?, ?, NOW()) + on duplicate key update last_suggested=values(last_suggested) + + `, master1Key.Hostname, master1Key.Port, master1BinlogCoordinates.LogFile, master1BinlogCoordinates.LogPos, + master2Key.Hostname, master2Key.Port, master2BinlogCoordinates.LogFile, master2BinlogCoordinates.LogPos, + ) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +func GetEquivalentMasterCoordinates(instanceCoordinates *InstanceBinlogCoordinates) (result [](*InstanceBinlogCoordinates), err error) { + query := ` + select + master1_hostname as hostname, + master1_port as port, + master1_binary_log_file as binlog_file, + master1_binary_log_pos as binlog_pos + from + master_position_equivalence + where + master2_hostname = ? + and master2_port = ? + and master2_binary_log_file = ? + and master2_binary_log_pos = ? + union + select + master2_hostname as hostname, + master2_port as port, + master2_binary_log_file as binlog_file, + master2_binary_log_pos as binlog_pos + from + master_position_equivalence + where + master1_hostname = ? + and master1_port = ? + and master1_binary_log_file = ? + and master1_binary_log_pos = ? 
+ ` + args := sqlutils.Args( + instanceCoordinates.Key.Hostname, + instanceCoordinates.Key.Port, + instanceCoordinates.Coordinates.LogFile, + instanceCoordinates.Coordinates.LogPos, + instanceCoordinates.Key.Hostname, + instanceCoordinates.Key.Port, + instanceCoordinates.Coordinates.LogFile, + instanceCoordinates.Coordinates.LogPos, + ) + + err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + equivalentCoordinates := InstanceBinlogCoordinates{} + equivalentCoordinates.Key.Hostname = m.GetString("hostname") + equivalentCoordinates.Key.Port = m.GetInt("port") + equivalentCoordinates.Coordinates.LogFile = m.GetString("binlog_file") + equivalentCoordinates.Coordinates.LogPos = m.GetInt64("binlog_pos") + + result = append(result, &equivalentCoordinates) + return nil + }) + + if err != nil { + return nil, err + } + + return result, nil +} + +func GetEquivalentBinlogCoordinatesFor(instanceCoordinates *InstanceBinlogCoordinates, belowKey *InstanceKey) (*BinlogCoordinates, error) { + possibleCoordinates, err := GetEquivalentMasterCoordinates(instanceCoordinates) + if err != nil { + return nil, err + } + for _, instanceCoordinates := range possibleCoordinates { + if instanceCoordinates.Key.Equals(belowKey) { + return &instanceCoordinates.Coordinates, nil + } + } + return nil, nil +} + +// ExpireMasterPositionEquivalence expires old master_position_equivalence +func ExpireMasterPositionEquivalence() error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + delete from master_position_equivalence + where last_suggested < NOW() - INTERVAL ? HOUR + `, config.Config.UnseenInstanceForgetHours, + ) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} diff --git a/go/vt/orchestrator/inst/minimal_instance.go b/go/vt/orchestrator/inst/minimal_instance.go new file mode 100644 index 0000000000..4a6dd3e4f3 --- /dev/null +++ b/go/vt/orchestrator/inst/minimal_instance.go @@ -0,0 +1,15 @@ +package inst + +type MinimalInstance struct { + Key InstanceKey + MasterKey InstanceKey + ClusterName string +} + +func (this *MinimalInstance) ToInstance() *Instance { + return &Instance{ + Key: this.Key, + MasterKey: this.MasterKey, + ClusterName: this.ClusterName, + } +} diff --git a/go/vt/orchestrator/inst/oracle_gtid_set.go b/go/vt/orchestrator/inst/oracle_gtid_set.go new file mode 100644 index 0000000000..fbbac78bbc --- /dev/null +++ b/go/vt/orchestrator/inst/oracle_gtid_set.go @@ -0,0 +1,128 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "strings" +) + +// OracleGtidSet represents a set of GTID ranges as depicted by Retrieved_Gtid_Set, Executed_Gtid_Set or @@gtid_purged. 
+type OracleGtidSet struct { + GtidEntries [](*OracleGtidSetEntry) +} + +// Example input: `230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-10539, +// 316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-8935:8984-6124596, +// 321f5c0d-70e5-11e5-adb2-ecf4bb2262ff:1-56` +func NewOracleGtidSet(gtidSet string) (res *OracleGtidSet, err error) { + res = &OracleGtidSet{} + + gtidSet = strings.TrimSpace(gtidSet) + if gtidSet == "" { + return res, nil + } + entries := strings.Split(gtidSet, ",") + for _, entry := range entries { + entry = strings.TrimSpace(entry) + if entry == "" { + continue + } + if gtidRange, err := NewOracleGtidSetEntry(entry); err == nil { + res.GtidEntries = append(res.GtidEntries, gtidRange) + } else { + return res, err + } + } + return res, nil +} + +// RemoveUUID removes entries that belong to given UUID. +// By way of how this works there can only be one entry matching our UUID, but we generalize. +// We keep order of entries. +func (this *OracleGtidSet) RemoveUUID(uuid string) (removed bool) { + filteredEntries := [](*OracleGtidSetEntry){} + for _, entry := range this.GtidEntries { + if entry.UUID == uuid { + removed = true + } else { + filteredEntries = append(filteredEntries, entry) + } + } + if removed { + this.GtidEntries = filteredEntries + } + return removed +} + +// RetainUUID retains only entries that belong to given UUID. +func (this *OracleGtidSet) RetainUUID(uuid string) (anythingRemoved bool) { + return this.RetainUUIDs([]string{uuid}) +} + +// RetainUUIDs retains only entries that belong to given UUIDs. +func (this *OracleGtidSet) RetainUUIDs(uuids []string) (anythingRemoved bool) { + retainUUIDs := map[string]bool{} + for _, uuid := range uuids { + retainUUIDs[uuid] = true + } + filteredEntries := [](*OracleGtidSetEntry){} + for _, entry := range this.GtidEntries { + if retainUUIDs[entry.UUID] { + filteredEntries = append(filteredEntries, entry) + } else { + anythingRemoved = true + } + } + if anythingRemoved { + this.GtidEntries = filteredEntries + } + return anythingRemoved +} + +// SharedUUIDs returns UUIDs (range-less) that are shared between the two sets +func (this *OracleGtidSet) SharedUUIDs(other *OracleGtidSet) (shared []string) { + thisUUIDs := map[string]bool{} + for _, entry := range this.GtidEntries { + thisUUIDs[entry.UUID] = true + } + for _, entry := range other.GtidEntries { + if thisUUIDs[entry.UUID] { + shared = append(shared, entry.UUID) + } + } + return shared +} + +// String returns a user-friendly string representation of this entry +func (this *OracleGtidSet) Explode() (result [](*OracleGtidSetEntry)) { + for _, entries := range this.GtidEntries { + result = append(result, entries.Explode()...) + } + return result +} + +func (this *OracleGtidSet) String() string { + tokens := []string{} + for _, entry := range this.GtidEntries { + tokens = append(tokens, entry.String()) + } + return strings.Join(tokens, ",") +} + +func (this *OracleGtidSet) IsEmpty() bool { + return len(this.GtidEntries) == 0 +} diff --git a/go/vt/orchestrator/inst/oracle_gtid_set_entry.go b/go/vt/orchestrator/inst/oracle_gtid_set_entry.go new file mode 100644 index 0000000000..ed59e26444 --- /dev/null +++ b/go/vt/orchestrator/inst/oracle_gtid_set_entry.go @@ -0,0 +1,75 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + "regexp" + "strconv" + "strings" +) + +var ( + singleValueInterval = regexp.MustCompile("^([0-9]+)$") + multiValueInterval = regexp.MustCompile("^([0-9]+)[-]([0-9]+)$") +) + +// OracleGtidSetEntry represents an entry in a set of GTID ranges, +// for example, the entry: "316d193c-70e5-11e5-adb2-ecf4bb2262ff:1-8935:8984-6124596" (may include gaps) +type OracleGtidSetEntry struct { + UUID string + Ranges string +} + +// NewOracleGtidSetEntry parses a single entry text +func NewOracleGtidSetEntry(gtidRangeString string) (*OracleGtidSetEntry, error) { + gtidRangeString = strings.TrimSpace(gtidRangeString) + tokens := strings.SplitN(gtidRangeString, ":", 2) + if len(tokens) != 2 { + return nil, fmt.Errorf("Cannot parse OracleGtidSetEntry from %s", gtidRangeString) + } + if tokens[0] == "" { + return nil, fmt.Errorf("Unexpected UUID: %s", tokens[0]) + } + if tokens[1] == "" { + return nil, fmt.Errorf("Unexpected GTID range: %s", tokens[1]) + } + gtidRange := &OracleGtidSetEntry{UUID: tokens[0], Ranges: tokens[1]} + return gtidRange, nil +} + +// String returns a user-friendly string representation of this entry +func (this *OracleGtidSetEntry) String() string { + return fmt.Sprintf("%s:%s", this.UUID, this.Ranges) +} + +// String returns a user-friendly string representation of this entry +func (this *OracleGtidSetEntry) Explode() (result [](*OracleGtidSetEntry)) { + intervals := strings.Split(this.Ranges, ":") + for _, interval := range intervals { + if submatch := multiValueInterval.FindStringSubmatch(interval); submatch != nil { + intervalStart, _ := strconv.Atoi(submatch[1]) + intervalEnd, _ := strconv.Atoi(submatch[2]) + for i := intervalStart; i <= intervalEnd; i++ { + result = append(result, &OracleGtidSetEntry{UUID: this.UUID, Ranges: fmt.Sprintf("%d", i)}) + } + } else if submatch := singleValueInterval.FindStringSubmatch(interval); submatch != nil { + result = append(result, &OracleGtidSetEntry{UUID: this.UUID, Ranges: interval}) + } + } + return result +} diff --git a/go/vt/orchestrator/inst/oracle_gtid_set_test.go b/go/vt/orchestrator/inst/oracle_gtid_set_test.go new file mode 100644 index 0000000000..a79a6173a5 --- /dev/null +++ b/go/vt/orchestrator/inst/oracle_gtid_set_test.go @@ -0,0 +1,227 @@ +package inst + +import ( + "testing" + + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func TestNewOracleGtidSetEntry(t *testing.T) { + { + uuidSet := "00020194-3333-3333-3333-333333333333:1-7" + entry, err := NewOracleGtidSetEntry(uuidSet) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(entry.UUID, "00020194-3333-3333-3333-333333333333") + test.S(t).ExpectEquals(entry.Ranges, "1-7") + } + { + uuidSet := "00020194-3333-3333-3333-333333333333:1-7:10-20" + entry, err := NewOracleGtidSetEntry(uuidSet) + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(entry.UUID, "00020194-3333-3333-3333-333333333333") + test.S(t).ExpectEquals(entry.Ranges, "1-7:10-20") + } + { + uuidSet := "00020194-3333-3333-3333-333333333333" + _, err := NewOracleGtidSetEntry(uuidSet) + test.S(t).ExpectNotNil(err) + } +} + +func TestExplode(t *testing.T) { 
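+	// Explode expands each UUID:range entry into single-GTID entries, e.g.
+	// "uuid:1-3" becomes "uuid:1", "uuid:2", "uuid:3". The cases below cover a
+	// single value, a simple range, multiple ranges, and a full GTID set.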
+ { + uuidSet := "00020194-3333-3333-3333-333333333333:7" + entry, err := NewOracleGtidSetEntry(uuidSet) + test.S(t).ExpectNil(err) + + exploded := entry.Explode() + test.S(t).ExpectEquals(len(exploded), 1) + test.S(t).ExpectEquals(exploded[0].String(), "00020194-3333-3333-3333-333333333333:7") + } + { + uuidSet := "00020194-3333-3333-3333-333333333333:1-3" + entry, err := NewOracleGtidSetEntry(uuidSet) + test.S(t).ExpectNil(err) + + exploded := entry.Explode() + test.S(t).ExpectEquals(len(exploded), 3) + test.S(t).ExpectEquals(exploded[0].String(), "00020194-3333-3333-3333-333333333333:1") + test.S(t).ExpectEquals(exploded[1].String(), "00020194-3333-3333-3333-333333333333:2") + test.S(t).ExpectEquals(exploded[2].String(), "00020194-3333-3333-3333-333333333333:3") + } + { + uuidSet := "00020194-3333-3333-3333-333333333333:1-3:6-7" + entry, err := NewOracleGtidSetEntry(uuidSet) + test.S(t).ExpectNil(err) + + exploded := entry.Explode() + test.S(t).ExpectEquals(len(exploded), 5) + test.S(t).ExpectEquals(exploded[0].String(), "00020194-3333-3333-3333-333333333333:1") + test.S(t).ExpectEquals(exploded[1].String(), "00020194-3333-3333-3333-333333333333:2") + test.S(t).ExpectEquals(exploded[2].String(), "00020194-3333-3333-3333-333333333333:3") + test.S(t).ExpectEquals(exploded[3].String(), "00020194-3333-3333-3333-333333333333:6") + test.S(t).ExpectEquals(exploded[4].String(), "00020194-3333-3333-3333-333333333333:7") + } + { + gtidSetVal := "00020192-1111-1111-1111-111111111111:29-30, 00020194-3333-3333-3333-333333333333:7-8" + gtidSet, err := NewOracleGtidSet(gtidSetVal) + test.S(t).ExpectNil(err) + + exploded := gtidSet.Explode() + test.S(t).ExpectEquals(len(exploded), 4) + test.S(t).ExpectEquals(exploded[0].String(), "00020192-1111-1111-1111-111111111111:29") + test.S(t).ExpectEquals(exploded[1].String(), "00020192-1111-1111-1111-111111111111:30") + test.S(t).ExpectEquals(exploded[2].String(), "00020194-3333-3333-3333-333333333333:7") + test.S(t).ExpectEquals(exploded[3].String(), "00020194-3333-3333-3333-333333333333:8") + } +} + +func TestNewOracleGtidSet(t *testing.T) { + { + gtidSetVal := "00020192-1111-1111-1111-111111111111:20-30, 00020194-3333-3333-3333-333333333333:7-8" + gtidSet, err := NewOracleGtidSet(gtidSetVal) + test.S(t).ExpectNil(err) + + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 2) + test.S(t).ExpectEquals(gtidSet.GtidEntries[0].String(), "00020192-1111-1111-1111-111111111111:20-30") + test.S(t).ExpectEquals(gtidSet.GtidEntries[1].String(), "00020194-3333-3333-3333-333333333333:7-8") + } + { + gtidSetVal := " ,,, , , 00020192-1111-1111-1111-111111111111:20-30,,,, 00020194-3333-3333-3333-333333333333:7-8,, ,," + gtidSet, err := NewOracleGtidSet(gtidSetVal) + test.S(t).ExpectNil(err) + + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 2) + test.S(t).ExpectEquals(gtidSet.GtidEntries[0].String(), "00020192-1111-1111-1111-111111111111:20-30") + test.S(t).ExpectEquals(gtidSet.GtidEntries[1].String(), "00020194-3333-3333-3333-333333333333:7-8") + } + { + gtidSetVal := " ,,, , ,, ,," + gtidSet, err := NewOracleGtidSet(gtidSetVal) + test.S(t).ExpectNil(err) + + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 0) + test.S(t).ExpectTrue(gtidSet.IsEmpty()) + } +} + +func TestRemoveUUID(t *testing.T) { + gtidSetVal := "00020192-1111-1111-1111-111111111111:20-30, 00020194-3333-3333-3333-333333333333:7-8" + { + gtidSet, err := NewOracleGtidSet(gtidSetVal) + test.S(t).ExpectNil(err) + + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 2) + 
gtidSet.RemoveUUID("00020194-3333-3333-3333-333333333333") + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 1) + test.S(t).ExpectEquals(gtidSet.GtidEntries[0].String(), "00020192-1111-1111-1111-111111111111:20-30") + + removed := gtidSet.RemoveUUID(`230ea8ea-81e3-11e4-972a-e25ec4bd140a`) + test.S(t).ExpectFalse(removed) + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 1) + test.S(t).ExpectEquals(gtidSet.GtidEntries[0].String(), "00020192-1111-1111-1111-111111111111:20-30") + } + { + gtidSet, err := NewOracleGtidSet(gtidSetVal) + test.S(t).ExpectNil(err) + + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 2) + + gtidSet.RemoveUUID("00020192-1111-1111-1111-111111111111") + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 1) + test.S(t).ExpectEquals(gtidSet.GtidEntries[0].String(), "00020194-3333-3333-3333-333333333333:7-8") + + gtidSet.RemoveUUID("00020194-3333-3333-3333-333333333333") + test.S(t).ExpectTrue(gtidSet.IsEmpty()) + } +} + +func TestRetainUUID(t *testing.T) { + gtidSetVal := "00020192-1111-1111-1111-111111111111:20-30, 00020194-3333-3333-3333-333333333333:7-8" + { + gtidSet, err := NewOracleGtidSet(gtidSetVal) + test.S(t).ExpectNil(err) + + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 2) + removed := gtidSet.RetainUUID("00020194-3333-3333-3333-333333333333") + test.S(t).ExpectTrue(removed) + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 1) + test.S(t).ExpectEquals(gtidSet.GtidEntries[0].String(), "00020194-3333-3333-3333-333333333333:7-8") + + removed = gtidSet.RetainUUID("00020194-3333-3333-3333-333333333333") + test.S(t).ExpectFalse(removed) + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 1) + test.S(t).ExpectEquals(gtidSet.GtidEntries[0].String(), "00020194-3333-3333-3333-333333333333:7-8") + + removed = gtidSet.RetainUUID("230ea8ea-81e3-11e4-972a-e25ec4bd140a") + test.S(t).ExpectTrue(removed) + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 0) + } +} + +func TestRetainUUIDs(t *testing.T) { + gtidSetVal := "00020192-1111-1111-1111-111111111111:20-30, 00020194-3333-3333-3333-333333333333:7-8" + { + gtidSet, err := NewOracleGtidSet(gtidSetVal) + test.S(t).ExpectNil(err) + + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 2) + removed := gtidSet.RetainUUIDs([]string{"00020194-3333-3333-3333-333333333333", "00020194-5555-5555-5555-333333333333"}) + test.S(t).ExpectTrue(removed) + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 1) + test.S(t).ExpectEquals(gtidSet.GtidEntries[0].String(), "00020194-3333-3333-3333-333333333333:7-8") + + removed = gtidSet.RetainUUIDs([]string{"00020194-3333-3333-3333-333333333333", "00020194-5555-5555-5555-333333333333"}) + test.S(t).ExpectFalse(removed) + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 1) + test.S(t).ExpectEquals(gtidSet.GtidEntries[0].String(), "00020194-3333-3333-3333-333333333333:7-8") + + removed = gtidSet.RetainUUIDs([]string{"230ea8ea-81e3-11e4-972a-e25ec4bd140a"}) + test.S(t).ExpectTrue(removed) + test.S(t).ExpectEquals(len(gtidSet.GtidEntries), 0) + } +} + +func TestSharedUUIDs(t *testing.T) { + gtidSetVal := "00020192-1111-1111-1111-111111111111:20-30, 00020194-3333-3333-3333-333333333333:7-8" + gtidSet, err := NewOracleGtidSet(gtidSetVal) + test.S(t).ExpectNil(err) + { + otherSet, err := NewOracleGtidSet("00020194-3333-3333-3333-333333333333:7-8,230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-2") + test.S(t).ExpectNil(err) + { + shared := gtidSet.SharedUUIDs(otherSet) + test.S(t).ExpectEquals(len(shared), 1) + test.S(t).ExpectEquals(shared[0], "00020194-3333-3333-3333-333333333333") + } + { + shared := 
otherSet.SharedUUIDs(gtidSet) + test.S(t).ExpectEquals(len(shared), 1) + test.S(t).ExpectEquals(shared[0], "00020194-3333-3333-3333-333333333333") + } + } + { + otherSet, err := NewOracleGtidSet("00020194-4444-4444-4444-333333333333:7-8,230ea8ea-81e3-11e4-972a-e25ec4bd140a:1-2") + test.S(t).ExpectNil(err) + { + shared := gtidSet.SharedUUIDs(otherSet) + test.S(t).ExpectEquals(len(shared), 0) + } + { + shared := otherSet.SharedUUIDs(gtidSet) + test.S(t).ExpectEquals(len(shared), 0) + } + } + { + otherSet, err := NewOracleGtidSet("00020194-3333-3333-3333-333333333333:7-8,00020192-1111-1111-1111-111111111111:1-2") + test.S(t).ExpectNil(err) + { + shared := gtidSet.SharedUUIDs(otherSet) + test.S(t).ExpectEquals(len(shared), 2) + } + { + shared := otherSet.SharedUUIDs(gtidSet) + test.S(t).ExpectEquals(len(shared), 2) + } + } +} diff --git a/go/vt/orchestrator/inst/pool.go b/go/vt/orchestrator/inst/pool.go new file mode 100644 index 0000000000..594283377e --- /dev/null +++ b/go/vt/orchestrator/inst/pool.go @@ -0,0 +1,79 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "strings" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +// PoolInstancesMap lists instance keys per pool name +type PoolInstancesMap map[string]([]*InstanceKey) + +type PoolInstancesSubmission struct { + CreatedAt time.Time + Pool string + DelimitedInstances string + RegisteredAt string +} + +func NewPoolInstancesSubmission(pool string, instances string) *PoolInstancesSubmission { + return &PoolInstancesSubmission{ + CreatedAt: time.Now(), + Pool: pool, + DelimitedInstances: instances, + } +} + +// ClusterPoolInstance is an instance mapping a cluster, pool & instance +type ClusterPoolInstance struct { + ClusterName string + ClusterAlias string + Pool string + Hostname string + Port int +} + +func ApplyPoolInstances(submission *PoolInstancesSubmission) error { + if submission.CreatedAt.Add(time.Duration(config.Config.InstancePoolExpiryMinutes) * time.Minute).Before(time.Now()) { + // already expired; no need to persist + return nil + } + var instanceKeys [](*InstanceKey) + if submission.DelimitedInstances != "" { + instancesStrings := strings.Split(submission.DelimitedInstances, ",") + for _, instanceString := range instancesStrings { + instanceString = strings.TrimSpace(instanceString) + instanceKey, err := ParseResolveInstanceKey(instanceString) + if config.Config.SupportFuzzyPoolHostnames { + instanceKey = ReadFuzzyInstanceKeyIfPossible(instanceKey) + } + if err != nil { + return log.Errore(err) + } + + instanceKeys = append(instanceKeys, instanceKey) + } + } + log.Debugf("submitting %d instances in %+v pool", len(instanceKeys), submission.Pool) + writePoolInstances(submission.Pool, instanceKeys) + return nil +} diff --git a/go/vt/orchestrator/inst/pool_dao.go b/go/vt/orchestrator/inst/pool_dao.go new file mode 100644 index 0000000000..c8fd337db2 --- /dev/null +++ 
b/go/vt/orchestrator/inst/pool_dao.go @@ -0,0 +1,156 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +// writePoolInstances will write (and override) a single cluster name mapping +func writePoolInstances(pool string, instanceKeys [](*InstanceKey)) error { + writeFunc := func() error { + dbh, err := db.OpenOrchestrator() + if err != nil { + return log.Errore(err) + } + tx, err := dbh.Begin() + if _, err := tx.Exec(`delete from database_instance_pool where pool = ?`, pool); err != nil { + tx.Rollback() + return log.Errore(err) + } + query := `insert into database_instance_pool (hostname, port, pool, registered_at) values (?, ?, ?, now())` + for _, instanceKey := range instanceKeys { + if _, err := tx.Exec(query, instanceKey.Hostname, instanceKey.Port, pool); err != nil { + tx.Rollback() + return log.Errore(err) + } + } + tx.Commit() + + return nil + } + return ExecDBWriteFunc(writeFunc) +} + +// ReadClusterPoolInstances reads cluster-pool-instance associationsfor given cluster and pool +func ReadClusterPoolInstances(clusterName string, pool string) (result [](*ClusterPoolInstance), err error) { + args := sqlutils.Args() + whereClause := `` + if clusterName != "" { + whereClause = ` + where + database_instance.cluster_name = ? + and ? in ('', pool) + ` + args = append(args, clusterName, pool) + } + query := fmt.Sprintf(` + select + cluster_name, + ifnull(alias, cluster_name) as alias, + database_instance_pool.* + from + database_instance + join database_instance_pool using (hostname, port) + left join cluster_alias using (cluster_name) + %s + `, whereClause) + err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + clusterPoolInstance := ClusterPoolInstance{ + ClusterName: m.GetString("cluster_name"), + ClusterAlias: m.GetString("alias"), + Pool: m.GetString("pool"), + Hostname: m.GetString("hostname"), + Port: m.GetInt("port"), + } + result = append(result, &clusterPoolInstance) + return nil + }) + + if err != nil { + return nil, err + } + + return result, nil +} + +// ReadAllClusterPoolInstances returns all clusters-pools-insatnces associations +func ReadAllClusterPoolInstances() ([](*ClusterPoolInstance), error) { + return ReadClusterPoolInstances("", "") +} + +// ReadClusterPoolInstancesMap returns association of pools-to-instances for a given cluster +// and potentially for a given pool. 
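+//
+// Illustrative usage (a hedged sketch, not part of the original code; the
+// cluster name is hypothetical, and an empty pool name matches all pools of
+// the cluster per ReadClusterPoolInstances above):
+//
+//	poolMap, err := ReadClusterPoolInstancesMap("mycluster", "")
+//	if err == nil {
+//		for pool, keys := range *poolMap {
+//			log.Debugf("pool %s: %d instances", pool, len(keys))
+//		}
+//	}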
+func ReadClusterPoolInstancesMap(clusterName string, pool string) (*PoolInstancesMap, error) { + var poolInstancesMap = make(PoolInstancesMap) + + clusterPoolInstances, err := ReadClusterPoolInstances(clusterName, pool) + if err != nil { + return nil, nil + } + for _, clusterPoolInstance := range clusterPoolInstances { + if _, ok := poolInstancesMap[clusterPoolInstance.Pool]; !ok { + poolInstancesMap[clusterPoolInstance.Pool] = [](*InstanceKey){} + } + poolInstancesMap[clusterPoolInstance.Pool] = append(poolInstancesMap[clusterPoolInstance.Pool], &InstanceKey{Hostname: clusterPoolInstance.Hostname, Port: clusterPoolInstance.Port}) + } + + return &poolInstancesMap, nil +} + +func ReadAllPoolInstancesSubmissions() ([]PoolInstancesSubmission, error) { + result := []PoolInstancesSubmission{} + query := ` + select + pool, + min(registered_at) as registered_at, + GROUP_CONCAT(concat(hostname, ':', port)) as hosts + from + database_instance_pool + group by + pool + ` + err := db.QueryOrchestrator(query, sqlutils.Args(), func(m sqlutils.RowMap) error { + submission := PoolInstancesSubmission{} + submission.Pool = m.GetString("pool") + submission.CreatedAt = m.GetTime("registered_at") + submission.RegisteredAt = m.GetString("registered_at") + submission.DelimitedInstances = m.GetString("hosts") + result = append(result, submission) + return nil + }) + + return result, log.Errore(err) +} + +// ExpirePoolInstances cleans up the database_instance_pool table from expired items +func ExpirePoolInstances() error { + _, err := db.ExecOrchestrator(` + delete + from database_instance_pool + where + registered_at < now() - interval ? minute + `, + config.Config.InstancePoolExpiryMinutes, + ) + return log.Errore(err) +} diff --git a/go/vt/orchestrator/inst/postponed_functions.go b/go/vt/orchestrator/inst/postponed_functions.go new file mode 100644 index 0000000000..4bb7bc99d1 --- /dev/null +++ b/go/vt/orchestrator/inst/postponed_functions.go @@ -0,0 +1,69 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "sync" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +type PostponedFunctionsContainer struct { + waitGroup sync.WaitGroup + mutex sync.Mutex + descriptions []string +} + +func NewPostponedFunctionsContainer() *PostponedFunctionsContainer { + postponedFunctionsContainer := &PostponedFunctionsContainer{ + descriptions: []string{}, + } + return postponedFunctionsContainer +} + +func (this *PostponedFunctionsContainer) AddPostponedFunction(postponedFunction func() error, description string) { + this.mutex.Lock() + defer this.mutex.Unlock() + + this.descriptions = append(this.descriptions, description) + + this.waitGroup.Add(1) + go func() { + defer this.waitGroup.Done() + postponedFunction() + }() +} + +func (this *PostponedFunctionsContainer) Wait() { + log.Debugf("PostponedFunctionsContainer: waiting on %+v postponed functions", this.Len()) + this.waitGroup.Wait() + log.Debugf("PostponedFunctionsContainer: done waiting") +} + +func (this *PostponedFunctionsContainer) Len() int { + this.mutex.Lock() + defer this.mutex.Unlock() + + return len(this.descriptions) +} + +func (this *PostponedFunctionsContainer) Descriptions() []string { + this.mutex.Lock() + defer this.mutex.Unlock() + + return this.descriptions +} diff --git a/go/vt/orchestrator/inst/process.go b/go/vt/orchestrator/inst/process.go new file mode 100644 index 0000000000..0fd0788d92 --- /dev/null +++ b/go/vt/orchestrator/inst/process.go @@ -0,0 +1,32 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +// Process presents a MySQL executing thread (as observed by PROCESSLIST) +type Process struct { + InstanceHostname string + InstancePort int + Id int64 + User string + Host string + Db string + Command string + Time int64 + State string + Info string + StartedAt string +} diff --git a/go/vt/orchestrator/inst/promotion_rule.go b/go/vt/orchestrator/inst/promotion_rule.go new file mode 100644 index 0000000000..8ea89ec4aa --- /dev/null +++ b/go/vt/orchestrator/inst/promotion_rule.go @@ -0,0 +1,62 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "fmt" +) + +// CandidatePromotionRule describe the promotion preference/rule for an instance. 
+// It maps to promotion_rule column in candidate_database_instance +type CandidatePromotionRule string + +const ( + MustPromoteRule CandidatePromotionRule = "must" + PreferPromoteRule CandidatePromotionRule = "prefer" + NeutralPromoteRule CandidatePromotionRule = "neutral" + PreferNotPromoteRule CandidatePromotionRule = "prefer_not" + MustNotPromoteRule CandidatePromotionRule = "must_not" +) + +var promotionRuleOrderMap = map[CandidatePromotionRule]int{ + MustPromoteRule: 0, + PreferPromoteRule: 1, + NeutralPromoteRule: 2, + PreferNotPromoteRule: 3, + MustNotPromoteRule: 4, +} + +func (this *CandidatePromotionRule) BetterThan(other CandidatePromotionRule) bool { + otherOrder, ok := promotionRuleOrderMap[other] + if !ok { + return false + } + return promotionRuleOrderMap[*this] < otherOrder +} + +// ParseCandidatePromotionRule returns a CandidatePromotionRule by name. +// It returns an error if there is no known rule by the given name. +func ParseCandidatePromotionRule(ruleName string) (CandidatePromotionRule, error) { + switch ruleName { + case "prefer", "neutral", "prefer_not", "must_not": + return CandidatePromotionRule(ruleName), nil + case "must": + return CandidatePromotionRule(""), fmt.Errorf("CandidatePromotionRule: %v not supported yet", ruleName) + default: + return CandidatePromotionRule(""), fmt.Errorf("Invalid CandidatePromotionRule: %v", ruleName) + } +} diff --git a/go/vt/orchestrator/inst/replication_thread_state.go b/go/vt/orchestrator/inst/replication_thread_state.go new file mode 100644 index 0000000000..2259aa7455 --- /dev/null +++ b/go/vt/orchestrator/inst/replication_thread_state.go @@ -0,0 +1,39 @@ +/* + Copyright 2019 GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +type ReplicationThreadState int + +const ( + ReplicationThreadStateNoThread ReplicationThreadState = -1 + ReplicationThreadStateStopped = 0 + ReplicationThreadStateRunning = 1 + ReplicationThreadStateOther = 2 +) + +func ReplicationThreadStateFromStatus(status string) ReplicationThreadState { + switch status { + case "No": + return ReplicationThreadStateStopped + case "Yes": + return ReplicationThreadStateRunning + } + return ReplicationThreadStateOther +} +func (this *ReplicationThreadState) IsRunning() bool { return *this == ReplicationThreadStateRunning } +func (this *ReplicationThreadState) IsStopped() bool { return *this == ReplicationThreadStateStopped } +func (this *ReplicationThreadState) Exists() bool { return *this != ReplicationThreadStateNoThread } diff --git a/go/vt/orchestrator/inst/resolve.go b/go/vt/orchestrator/inst/resolve.go new file mode 100644 index 0000000000..c51ce3343a --- /dev/null +++ b/go/vt/orchestrator/inst/resolve.go @@ -0,0 +1,327 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "errors" + "fmt" + "net" + "regexp" + "strings" + "sync" + "time" + + "github.com/patrickmn/go-cache" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +type HostnameResolve struct { + hostname string + resolvedHostname string +} + +func (this HostnameResolve) String() string { + return fmt.Sprintf("%s %s", this.hostname, this.resolvedHostname) +} + +type HostnameUnresolve struct { + hostname string + unresolvedHostname string +} + +func (this HostnameUnresolve) String() string { + return fmt.Sprintf("%s %s", this.hostname, this.unresolvedHostname) +} + +type HostnameRegistration struct { + CreatedAt time.Time + Key InstanceKey + Hostname string +} + +func NewHostnameRegistration(instanceKey *InstanceKey, hostname string) *HostnameRegistration { + return &HostnameRegistration{ + CreatedAt: time.Now(), + Key: *instanceKey, + Hostname: hostname, + } +} + +func NewHostnameDeregistration(instanceKey *InstanceKey) *HostnameRegistration { + return &HostnameRegistration{ + CreatedAt: time.Now(), + Key: *instanceKey, + Hostname: "", + } +} + +var hostnameResolvesLightweightCache *cache.Cache +var hostnameResolvesLightweightCacheInit = &sync.Mutex{} +var hostnameResolvesLightweightCacheLoadedOnceFromDB bool = false +var hostnameIPsCache = cache.New(10*time.Minute, time.Minute) + +func init() { + if config.Config.ExpiryHostnameResolvesMinutes < 1 { + config.Config.ExpiryHostnameResolvesMinutes = 1 + } +} + +func getHostnameResolvesLightweightCache() *cache.Cache { + hostnameResolvesLightweightCacheInit.Lock() + defer hostnameResolvesLightweightCacheInit.Unlock() + if hostnameResolvesLightweightCache == nil { + hostnameResolvesLightweightCache = cache.New(time.Duration(config.Config.ExpiryHostnameResolvesMinutes)*time.Minute, time.Minute) + } + return hostnameResolvesLightweightCache +} + +func HostnameResolveMethodIsNone() bool { + return strings.ToLower(config.Config.HostnameResolveMethod) == "none" +} + +// GetCNAME resolves an IP or hostname into a normalized valid CNAME +func GetCNAME(hostname string) (string, error) { + res, err := net.LookupCNAME(hostname) + if err != nil { + return hostname, err + } + res = strings.TrimRight(res, ".") + return res, nil +} + +func resolveHostname(hostname string) (string, error) { + switch strings.ToLower(config.Config.HostnameResolveMethod) { + case "none": + return hostname, nil + case "default": + return hostname, nil + case "cname": + return GetCNAME(hostname) + case "ip": + return getHostnameIP(hostname) + } + return hostname, nil +} + +// Attempt to resolve a hostname. This may return a database cached hostname or otherwise +// it may resolve the hostname via CNAME +func ResolveHostname(hostname string) (string, error) { + hostname = strings.TrimSpace(hostname) + if hostname == "" { + return hostname, errors.New("Will not resolve empty hostname") + } + if strings.Contains(hostname, ",") { + return hostname, fmt.Errorf("Will not resolve multi-hostname: %+v", hostname) + } + if (&InstanceKey{Hostname: hostname}).IsDetached() { + // quietly abort. Nothing to do. 
The hostname is detached for a reason: it + // will not be resolved, for sure. + return hostname, nil + } + + // First go to lightweight cache + if resolvedHostname, found := getHostnameResolvesLightweightCache().Get(hostname); found { + return resolvedHostname.(string), nil + } + + if !hostnameResolvesLightweightCacheLoadedOnceFromDB { + // A continuous-discovery will first make sure to load all resolves from DB. + // However cli does not do so. + // Anyway, it seems like the cache was not loaded from DB. Before doing real resolves, + // let's try and get the resolved hostname from database. + if !HostnameResolveMethodIsNone() { + go func() { + if resolvedHostname, err := ReadResolvedHostname(hostname); err == nil && resolvedHostname != "" { + getHostnameResolvesLightweightCache().Set(hostname, resolvedHostname, 0) + } + }() + } + } + + // Unfound: resolve! + log.Debugf("Hostname unresolved yet: %s", hostname) + resolvedHostname, err := resolveHostname(hostname) + if config.Config.RejectHostnameResolvePattern != "" { + // Reject, don't even cache + if matched, _ := regexp.MatchString(config.Config.RejectHostnameResolvePattern, resolvedHostname); matched { + log.Warningf("ResolveHostname: %+v resolved to %+v but rejected due to RejectHostnameResolvePattern '%+v'", hostname, resolvedHostname, config.Config.RejectHostnameResolvePattern) + return hostname, nil + } + } + + if err != nil { + // Problem. What we'll do is cache the hostname for just one minute, so as to avoid flooding requests + // on one hand, yet make it refresh shortly on the other hand. Anyway do not write to database. + getHostnameResolvesLightweightCache().Set(hostname, resolvedHostname, time.Minute) + return hostname, err + } + // Good result! Cache it, also to DB + log.Debugf("Cache hostname resolve %s as %s", hostname, resolvedHostname) + go UpdateResolvedHostname(hostname, resolvedHostname) + return resolvedHostname, nil +} + +// UpdateResolvedHostname will store the given resolved hostname in cache +// Returns false when the key already existed with same resolved value (similar +// to AFFECTED_ROWS() in mysql) +func UpdateResolvedHostname(hostname string, resolvedHostname string) bool { + if resolvedHostname == "" { + return false + } + if existingResolvedHostname, found := getHostnameResolvesLightweightCache().Get(hostname); found && (existingResolvedHostname == resolvedHostname) { + return false + } + getHostnameResolvesLightweightCache().Set(hostname, resolvedHostname, 0) + if !HostnameResolveMethodIsNone() { + WriteResolvedHostname(hostname, resolvedHostname) + } + return true +} + +func LoadHostnameResolveCache() error { + if !HostnameResolveMethodIsNone() { + return loadHostnameResolveCacheFromDatabase() + } + return nil +} + +func loadHostnameResolveCacheFromDatabase() error { + allHostnamesResolves, err := ReadAllHostnameResolves() + if err != nil { + return err + } + for _, hostnameResolve := range allHostnamesResolves { + getHostnameResolvesLightweightCache().Set(hostnameResolve.hostname, hostnameResolve.resolvedHostname, 0) + } + hostnameResolvesLightweightCacheLoadedOnceFromDB = true + return nil +} + +func FlushNontrivialResolveCacheToDatabase() error { + if HostnameResolveMethodIsNone() { + return nil + } + items, _ := HostnameResolveCache() + for hostname := range items { + resolvedHostname, found := getHostnameResolvesLightweightCache().Get(hostname) + if found && (resolvedHostname.(string) != hostname) { + WriteResolvedHostname(hostname, resolvedHostname.(string)) + } + } + return nil +} + +func 
ResetHostnameResolveCache() error { + err := deleteHostnameResolves() + getHostnameResolvesLightweightCache().Flush() + hostnameResolvesLightweightCacheLoadedOnceFromDB = false + return err +} + +func HostnameResolveCache() (map[string]cache.Item, error) { + return getHostnameResolvesLightweightCache().Items(), nil +} + +func UnresolveHostname(instanceKey *InstanceKey) (InstanceKey, bool, error) { + if *config.RuntimeCLIFlags.SkipUnresolve { + return *instanceKey, false, nil + } + unresolvedHostname, err := readUnresolvedHostname(instanceKey.Hostname) + if err != nil { + return *instanceKey, false, log.Errore(err) + } + if unresolvedHostname == instanceKey.Hostname { + // unchanged. Nothing to do + return *instanceKey, false, nil + } + // We unresovled to a different hostname. We will now re-resolve to double-check! + unresolvedKey := &InstanceKey{Hostname: unresolvedHostname, Port: instanceKey.Port} + + instance, err := ReadTopologyInstance(unresolvedKey) + if err != nil { + return *instanceKey, false, log.Errore(err) + } + if instance.IsBinlogServer() && config.Config.SkipBinlogServerUnresolveCheck { + // Do nothing. Everything is assumed to be fine. + } else if instance.Key.Hostname != instanceKey.Hostname { + // Resolve(Unresolve(hostname)) != hostname ==> Bad; reject + if *config.RuntimeCLIFlags.SkipUnresolveCheck { + return *instanceKey, false, nil + } + return *instanceKey, false, log.Errorf("Error unresolving; hostname=%s, unresolved=%s, re-resolved=%s; mismatch. Skip/ignore with --skip-unresolve-check", instanceKey.Hostname, unresolvedKey.Hostname, instance.Key.Hostname) + } + return *unresolvedKey, true, nil +} + +func RegisterHostnameUnresolve(registration *HostnameRegistration) (err error) { + if registration.Hostname == "" { + return DeleteHostnameUnresolve(®istration.Key) + } + if registration.CreatedAt.Add(time.Duration(config.Config.ExpiryHostnameResolvesMinutes) * time.Minute).Before(time.Now()) { + // already expired. + return nil + } + return WriteHostnameUnresolve(®istration.Key, registration.Hostname) +} + +func extractIPs(ips []net.IP) (ipv4String string, ipv6String string) { + for _, ip := range ips { + if ip4 := ip.To4(); ip4 != nil { + ipv4String = ip.String() + } else { + ipv6String = ip.String() + } + } + return ipv4String, ipv6String +} + +func getHostnameIPs(hostname string) (ips []net.IP, fromCache bool, err error) { + if ips, found := hostnameIPsCache.Get(hostname); found { + return ips.([]net.IP), true, nil + } + ips, err = net.LookupIP(hostname) + if err != nil { + return ips, false, log.Errore(err) + } + hostnameIPsCache.Set(hostname, ips, cache.DefaultExpiration) + return ips, false, nil +} + +func getHostnameIP(hostname string) (ipString string, err error) { + ips, _, err := getHostnameIPs(hostname) + if err != nil { + return ipString, err + } + ipv4String, ipv6String := extractIPs(ips) + if ipv4String != "" { + return ipv4String, nil + } + return ipv6String, nil +} + +func ResolveHostnameIPs(hostname string) error { + ips, fromCache, err := getHostnameIPs(hostname) + if err != nil { + return err + } + if fromCache { + return nil + } + ipv4String, ipv6String := extractIPs(ips) + return writeHostnameIPs(hostname, ipv4String, ipv6String) +} diff --git a/go/vt/orchestrator/inst/resolve_dao.go b/go/vt/orchestrator/inst/resolve_dao.go new file mode 100644 index 0000000000..19a8f19f7d --- /dev/null +++ b/go/vt/orchestrator/inst/resolve_dao.go @@ -0,0 +1,366 @@ +/* + Copyright 2014 Outbrain Inc. 
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +import ( + "github.com/rcrowley/go-metrics" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +var writeResolvedHostnameCounter = metrics.NewCounter() +var writeUnresolvedHostnameCounter = metrics.NewCounter() +var readResolvedHostnameCounter = metrics.NewCounter() +var readUnresolvedHostnameCounter = metrics.NewCounter() +var readAllResolvedHostnamesCounter = metrics.NewCounter() + +func init() { + metrics.Register("resolve.write_resolved", writeResolvedHostnameCounter) + metrics.Register("resolve.write_unresolved", writeUnresolvedHostnameCounter) + metrics.Register("resolve.read_resolved", readResolvedHostnameCounter) + metrics.Register("resolve.read_unresolved", readUnresolvedHostnameCounter) + metrics.Register("resolve.read_resolved_all", readAllResolvedHostnamesCounter) +} + +// WriteResolvedHostname stores a hostname and the resolved hostname to backend database +func WriteResolvedHostname(hostname string, resolvedHostname string) error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + insert into + hostname_resolve (hostname, resolved_hostname, resolved_timestamp) + values + (?, ?, NOW()) + on duplicate key update + resolved_hostname = VALUES(resolved_hostname), + resolved_timestamp = VALUES(resolved_timestamp) + `, + hostname, + resolvedHostname) + if err != nil { + return log.Errore(err) + } + if hostname != resolvedHostname { + // history is only interesting when there's actually something to resolve... + _, err = db.ExecOrchestrator(` + insert into + hostname_resolve_history (hostname, resolved_hostname, resolved_timestamp) + values + (?, ?, NOW()) + on duplicate key update + hostname=values(hostname), + resolved_timestamp=values(resolved_timestamp) + `, + hostname, + resolvedHostname) + } + writeResolvedHostnameCounter.Inc(1) + return nil + } + return ExecDBWriteFunc(writeFunc) +} + +// ReadResolvedHostname returns the resolved hostname given a hostname, or empty if not exists +func ReadResolvedHostname(hostname string) (string, error) { + var resolvedHostname string = "" + + query := ` + select + resolved_hostname + from + hostname_resolve + where + hostname = ? 
+ ` + + err := db.QueryOrchestrator(query, sqlutils.Args(hostname), func(m sqlutils.RowMap) error { + resolvedHostname = m.GetString("resolved_hostname") + return nil + }) + readResolvedHostnameCounter.Inc(1) + + if err != nil { + log.Errore(err) + } + return resolvedHostname, err +} + +func ReadAllHostnameResolves() ([]HostnameResolve, error) { + res := []HostnameResolve{} + query := ` + select + hostname, + resolved_hostname + from + hostname_resolve + ` + err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + hostnameResolve := HostnameResolve{hostname: m.GetString("hostname"), resolvedHostname: m.GetString("resolved_hostname")} + + res = append(res, hostnameResolve) + return nil + }) + readAllResolvedHostnamesCounter.Inc(1) + + if err != nil { + log.Errore(err) + } + return res, err +} + +// ReadAllHostnameUnresolves returns the content of the hostname_unresolve table +func ReadAllHostnameUnresolves() ([]HostnameUnresolve, error) { + unres := []HostnameUnresolve{} + query := ` + select + hostname, + unresolved_hostname + from + hostname_unresolve + ` + err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + hostnameUnresolve := HostnameUnresolve{hostname: m.GetString("hostname"), unresolvedHostname: m.GetString("unresolved_hostname")} + + unres = append(unres, hostnameUnresolve) + return nil + }) + + return unres, log.Errore(err) +} + +// ReadAllHostnameUnresolves returns the content of the hostname_unresolve table +func ReadAllHostnameUnresolvesRegistrations() (registrations []HostnameRegistration, err error) { + unresolves, err := ReadAllHostnameUnresolves() + if err != nil { + return registrations, err + } + for _, unresolve := range unresolves { + registration := NewHostnameRegistration(&InstanceKey{Hostname: unresolve.hostname}, unresolve.unresolvedHostname) + registrations = append(registrations, *registration) + } + return registrations, nil +} + +// readUnresolvedHostname reverse-reads hostname resolve. It returns a hostname which matches given pattern and resovles to resolvedHostname, +// or, in the event no such hostname is found, the given resolvedHostname, unchanged. +func readUnresolvedHostname(hostname string) (string, error) { + unresolvedHostname := hostname + + query := ` + select + unresolved_hostname + from + hostname_unresolve + where + hostname = ? + ` + + err := db.QueryOrchestrator(query, sqlutils.Args(hostname), func(m sqlutils.RowMap) error { + unresolvedHostname = m.GetString("unresolved_hostname") + return nil + }) + readUnresolvedHostnameCounter.Inc(1) + + if err != nil { + log.Errore(err) + } + return unresolvedHostname, err +} + +// readMissingHostnamesToResolve gets those (unresolved, e.g. VIP) hostnames that *should* be present in +// the hostname_resolve table, but aren't. 
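+//
+// Illustrative usage (a hedged sketch, not part of the original code; it
+// assumes InstanceKeyMap has map semantics, as its AddKey usage suggests):
+//
+//	missingKeys, err := readMissingKeysToResolve()
+//	if err == nil {
+//		log.Debugf("%d unresolved (e.g. VIP) keys missing from hostname_resolve", len(missingKeys))
+//	}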
+func readMissingKeysToResolve() (result InstanceKeyMap, err error) { + query := ` + select + hostname_unresolve.unresolved_hostname, + database_instance.port + from + database_instance + join hostname_unresolve on (database_instance.hostname = hostname_unresolve.hostname) + left join hostname_resolve on (database_instance.hostname = hostname_resolve.resolved_hostname) + where + hostname_resolve.hostname is null + ` + + err = db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + instanceKey := InstanceKey{Hostname: m.GetString("unresolved_hostname"), Port: m.GetInt("port")} + result.AddKey(instanceKey) + return nil + }) + + if err != nil { + log.Errore(err) + } + return result, err +} + +// WriteHostnameUnresolve upserts an entry in hostname_unresolve +func WriteHostnameUnresolve(instanceKey *InstanceKey, unresolvedHostname string) error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + insert into hostname_unresolve ( + hostname, + unresolved_hostname, + last_registered) + values (?, ?, NOW()) + on duplicate key update + unresolved_hostname=values(unresolved_hostname), + last_registered=now() + `, instanceKey.Hostname, unresolvedHostname, + ) + if err != nil { + return log.Errore(err) + } + _, err = db.ExecOrchestrator(` + replace into hostname_unresolve_history ( + hostname, + unresolved_hostname, + last_registered) + values (?, ?, NOW()) + `, instanceKey.Hostname, unresolvedHostname, + ) + writeUnresolvedHostnameCounter.Inc(1) + return nil + } + return ExecDBWriteFunc(writeFunc) +} + +// DeleteHostnameUnresolve removes an unresolve entry +func DeleteHostnameUnresolve(instanceKey *InstanceKey) error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + delete from hostname_unresolve + where hostname=? + `, instanceKey.Hostname, + ) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +// ExpireHostnameUnresolve expires hostname_unresolve entries that haven't been updated recently. +func ExpireHostnameUnresolve() error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + delete from hostname_unresolve + where last_registered < NOW() - INTERVAL ? MINUTE + `, config.Config.ExpiryHostnameResolvesMinutes, + ) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +// ForgetExpiredHostnameResolves +func ForgetExpiredHostnameResolves() error { + _, err := db.ExecOrchestrator(` + delete + from hostname_resolve + where + resolved_timestamp < NOW() - interval ? minute`, + 2*config.Config.ExpiryHostnameResolvesMinutes, + ) + return err +} + +// DeleteInvalidHostnameResolves removes invalid resolves. 
At this time these are: +// - infinite loop resolves (A->B and B->A), remove earlier mapping +func DeleteInvalidHostnameResolves() error { + var invalidHostnames []string + + query := ` + select + early.hostname + from + hostname_resolve as latest + join hostname_resolve early on (latest.resolved_hostname = early.hostname and latest.hostname = early.resolved_hostname) + where + latest.hostname != latest.resolved_hostname + and latest.resolved_timestamp > early.resolved_timestamp + ` + + err := db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + invalidHostnames = append(invalidHostnames, m.GetString("hostname")) + return nil + }) + if err != nil { + return err + } + + for _, invalidHostname := range invalidHostnames { + _, err = db.ExecOrchestrator(` + delete + from hostname_resolve + where + hostname = ?`, + invalidHostname, + ) + log.Errore(err) + } + return err +} + +// deleteHostnameResolves compeltely erases the database cache +func deleteHostnameResolves() error { + _, err := db.ExecOrchestrator(` + delete + from hostname_resolve`, + ) + return err +} + +// writeHostnameIPs stroes an ipv4 and ipv6 associated witha hostname, if available +func writeHostnameIPs(hostname string, ipv4String string, ipv6String string) error { + writeFunc := func() error { + _, err := db.ExecOrchestrator(` + insert into + hostname_ips (hostname, ipv4, ipv6, last_updated) + values + (?, ?, ?, NOW()) + on duplicate key update + ipv4 = VALUES(ipv4), + ipv6 = VALUES(ipv6), + last_updated = VALUES(last_updated) + `, + hostname, + ipv4String, + ipv6String, + ) + return log.Errore(err) + } + return ExecDBWriteFunc(writeFunc) +} + +// readUnresolvedHostname reverse-reads hostname resolve. It returns a hostname which matches given pattern and resovles to resolvedHostname, +// or, in the event no such hostname is found, the given resolvedHostname, unchanged. +func readHostnameIPs(hostname string) (ipv4 string, ipv6 string, err error) { + query := ` + select + ipv4, ipv6 + from + hostname_ips + where + hostname = ? + ` + err = db.QueryOrchestrator(query, sqlutils.Args(hostname), func(m sqlutils.RowMap) error { + ipv4 = m.GetString("ipv4") + ipv6 = m.GetString("ipv6") + return nil + }) + return ipv4, ipv6, log.Errore(err) +} diff --git a/go/vt/orchestrator/inst/tag.go b/go/vt/orchestrator/inst/tag.go new file mode 100644 index 0000000000..5d54d46cdb --- /dev/null +++ b/go/vt/orchestrator/inst/tag.go @@ -0,0 +1,122 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "fmt" + "regexp" + "strings" +) + +type Tag struct { + TagName string + TagValue string + HasValue bool + Negate bool +} + +var ( + negateTagEqualsRegexp = regexp.MustCompile("^~([^=]+)=(.*)$") + TagEqualsRegexp = regexp.MustCompile("^([^=]+)=(.*)$") + negateTagExistsRegexp = regexp.MustCompile("^~([^=]+)$") + tagExistsRegexp = regexp.MustCompile("^([^=]+)$") +) + +func NewTag(tagName string, tagValue string) (*Tag, error) { + tagName = strings.TrimSpace(tagName) + if tagName == "" { + return nil, fmt.Errorf("NewTag: empty tag name") + } + return &Tag{TagName: tagName, TagValue: tagValue}, nil +} + +func ParseTag(tagString string) (*Tag, error) { + tagString = strings.Replace(tagString, "!", "~", -1) + tagString = strings.TrimSpace(tagString) + + if submatch := negateTagEqualsRegexp.FindStringSubmatch(tagString); len(submatch) > 0 { + return &Tag{ + TagName: submatch[1], + TagValue: submatch[2], + HasValue: true, + Negate: true, + }, nil + } else if submatch := TagEqualsRegexp.FindStringSubmatch(tagString); len(submatch) > 0 { + return &Tag{ + TagName: submatch[1], + TagValue: submatch[2], + HasValue: true, + }, nil + } else if submatch := negateTagExistsRegexp.FindStringSubmatch(tagString); len(submatch) > 0 { + return &Tag{ + TagName: submatch[1], + Negate: true, + }, nil + } else if submatch := tagExistsRegexp.FindStringSubmatch(tagString); len(submatch) > 0 { + return &Tag{ + TagName: submatch[1], + }, nil + } + return nil, fmt.Errorf("Unable to parse tag: %s", tagString) +} + +func (tag *Tag) String() string { + return fmt.Sprintf("%s=%s", tag.TagName, tag.TagValue) +} + +func (tag *Tag) Display() string { + if tag.TagValue == "" { + return fmt.Sprintf("%s", tag.TagName) + } else { + return fmt.Sprintf("%s=%s", tag.TagName, tag.TagValue) + } +} + +func ParseIntersectTags(tagsString string) (tags [](*Tag), err error) { + for _, tagString := range strings.Split(tagsString, ",") { + tag, err := ParseTag(tagString) + if err != nil { + return tags, err + } + tags = append(tags, tag) + } + return tags, nil +} + +type InstanceTag struct { + Key InstanceKey + T Tag +} + +func GetInstanceKeysByTags(tagsString string) (tagged *InstanceKeyMap, err error) { + tags, err := ParseIntersectTags(tagsString) + if err != nil { + return tagged, err + } + for i, tag := range tags { + taggedByTag, err := GetInstanceKeysByTag(tag) + if err != nil { + return tagged, err + } + if i == 0 { + tagged = taggedByTag + } else { + tagged = tagged.Intersect(taggedByTag) + } + } + return tagged, nil +} diff --git a/go/vt/orchestrator/inst/tag_dao.go b/go/vt/orchestrator/inst/tag_dao.go new file mode 100644 index 0000000000..fb5fea1899 --- /dev/null +++ b/go/vt/orchestrator/inst/tag_dao.go @@ -0,0 +1,192 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package inst + +import ( + "fmt" + + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +func PutInstanceTag(instanceKey *InstanceKey, tag *Tag) (err error) { + _, err = db.ExecOrchestrator(` + insert + into database_instance_tags ( + hostname, port, tag_name, tag_value, last_updated + ) VALUES ( + ?, ?, ?, ?, NOW() + ) + on duplicate key update + tag_value=values(tag_value), + last_updated=values(last_updated) + `, + instanceKey.Hostname, + instanceKey.Port, + tag.TagName, + tag.TagValue, + ) + return err +} + +func Untag(instanceKey *InstanceKey, tag *Tag) (tagged *InstanceKeyMap, err error) { + if tag == nil { + return nil, log.Errorf("Untag: tag is nil") + } + if tag.Negate { + return nil, log.Errorf("Untag: does not support negation") + } + if instanceKey == nil && !tag.HasValue { + return nil, log.Errorf("Untag: either indicate an instance or a tag value. Will not delete on-valued tag across instances") + } + clause := `` + args := sqlutils.Args() + if tag.HasValue { + clause = `tag_name=? and tag_value=?` + args = append(args, tag.TagName, tag.TagValue) + } else { + clause = `tag_name=?` + args = append(args, tag.TagName) + } + if instanceKey != nil { + clause = fmt.Sprintf("%s and hostname=? and port=?", clause) + args = append(args, instanceKey.Hostname, instanceKey.Port) + } + tagged = NewInstanceKeyMap() + query := fmt.Sprintf(` + select + hostname, + port + from + database_instance_tags + where + %s + order by hostname, port + `, clause, + ) + err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + key, _ := NewResolveInstanceKey(m.GetString("hostname"), m.GetInt("port")) + tagged.AddKey(*key) + return nil + }) + + query = fmt.Sprintf(` + delete from + database_instance_tags + where + %s + `, clause, + ) + if _, err = db.ExecOrchestrator(query, args...); err != nil { + return tagged, log.Errore(err) + } + AuditOperation("delete-instance-tag", instanceKey, tag.String()) + return tagged, nil +} + +func ReadInstanceTag(instanceKey *InstanceKey, tag *Tag) (tagExists bool, err error) { + query := ` + select + tag_value + from + database_instance_tags + where + hostname = ? + and port = ? + and tag_name = ? + ` + args := sqlutils.Args(instanceKey.Hostname, instanceKey.Port, tag.TagName) + err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + tag.TagValue = m.GetString("tag_value") + tagExists = true + return nil + }) + + return tagExists, log.Errore(err) +} + +func InstanceTagExists(instanceKey *InstanceKey, tag *Tag) (tagExists bool, err error) { + return ReadInstanceTag(instanceKey, &Tag{TagName: tag.TagName}) +} + +func ReadInstanceTags(instanceKey *InstanceKey) (tags [](*Tag), err error) { + tags = [](*Tag){} + query := ` + select + tag_name, tag_value + from + database_instance_tags + where + hostname = ? + and port = ? 
+ order by tag_name + ` + args := sqlutils.Args(instanceKey.Hostname, instanceKey.Port) + err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + tag := &Tag{ + TagName: m.GetString("tag_name"), + TagValue: m.GetString("tag_value"), + } + tags = append(tags, tag) + return nil + }) + + return tags, log.Errore(err) +} + +func GetInstanceKeysByTag(tag *Tag) (tagged *InstanceKeyMap, err error) { + if tag == nil { + return nil, log.Errorf("GetInstanceKeysByTag: tag is nil") + } + clause := `` + args := sqlutils.Args() + if tag.HasValue && !tag.Negate { + // exists and equals + clause = `tag_name=? and tag_value=?` + args = append(args, tag.TagName, tag.TagValue) + } else if !tag.HasValue && !tag.Negate { + // exists + clause = `tag_name=?` + args = append(args, tag.TagName) + } else if tag.HasValue && tag.Negate { + // exists and not equal + clause = `tag_name=? and tag_value!=?` + args = append(args, tag.TagName, tag.TagValue) + } else if !tag.HasValue && tag.Negate { + // does not exist + clause = `1=1 group by hostname, port having sum(tag_name=?)=0` + args = append(args, tag.TagName) + } + tagged = NewInstanceKeyMap() + query := fmt.Sprintf(` + select + hostname, + port + from + database_instance_tags + where + %s + order by hostname, port + `, clause) + err = db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + key, _ := NewResolveInstanceKey(m.GetString("hostname"), m.GetInt("port")) + tagged.AddKey(*key) + return nil + }) + return tagged, log.Errore(err) +} diff --git a/go/vt/orchestrator/inst/tag_test.go b/go/vt/orchestrator/inst/tag_test.go new file mode 100644 index 0000000000..8551cd7b70 --- /dev/null +++ b/go/vt/orchestrator/inst/tag_test.go @@ -0,0 +1,140 @@ +package inst + +import ( + "testing" + + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func TestParseTag(t *testing.T) { + { + tag, err := ParseTag("") + test.S(t).ExpectTrue(tag == nil) + test.S(t).ExpectNotNil(err) + } + { + tag, err := ParseTag("=") + test.S(t).ExpectTrue(tag == nil) + test.S(t).ExpectNotNil(err) + } + { + tag, err := ParseTag("=backup") + test.S(t).ExpectTrue(tag == nil) + test.S(t).ExpectNotNil(err) + } + { + tag, err := ParseTag(" =backup") + test.S(t).ExpectTrue(tag == nil) + test.S(t).ExpectNotNil(err) + } + { + tag, err := ParseTag("role") + test.S(t).ExpectNil(err) + test.S(t).ExpectTrue(tag != nil) + test.S(t).ExpectEquals(tag.TagName, "role") + test.S(t).ExpectEquals(tag.TagValue, "") + test.S(t).ExpectFalse(tag.Negate) + test.S(t).ExpectFalse(tag.HasValue) + + test.S(t).ExpectEquals(tag.String(), "role=") + } + { + tag, err := ParseTag("role=") + test.S(t).ExpectNil(err) + test.S(t).ExpectTrue(tag != nil) + test.S(t).ExpectEquals(tag.TagName, "role") + test.S(t).ExpectEquals(tag.TagValue, "") + test.S(t).ExpectFalse(tag.Negate) + test.S(t).ExpectTrue(tag.HasValue) + + test.S(t).ExpectEquals(tag.String(), "role=") + + } + { + tag, err := ParseTag("role=backup") + test.S(t).ExpectNil(err) + test.S(t).ExpectTrue(tag != nil) + test.S(t).ExpectEquals(tag.TagName, "role") + test.S(t).ExpectEquals(tag.TagValue, "backup") + test.S(t).ExpectFalse(tag.Negate) + test.S(t).ExpectTrue(tag.HasValue) + + test.S(t).ExpectEquals(tag.String(), "role=backup") + } + { + tag, err := ParseTag("!role") + test.S(t).ExpectNil(err) + test.S(t).ExpectTrue(tag != nil) + test.S(t).ExpectEquals(tag.TagName, "role") + test.S(t).ExpectTrue(tag.Negate) + test.S(t).ExpectFalse(tag.HasValue) + } + { + tag, err := ParseTag("~role=backup") + test.S(t).ExpectNil(err) + 
test.S(t).ExpectTrue(tag != nil) + test.S(t).ExpectEquals(tag.TagName, "role") + test.S(t).ExpectEquals(tag.TagValue, "backup") + test.S(t).ExpectTrue(tag.Negate) + test.S(t).ExpectTrue(tag.HasValue) + } +} + +func TestParseIntersectTags(t *testing.T) { + { + _, err := ParseIntersectTags("") + test.S(t).ExpectNotNil(err) + } + { + _, err := ParseIntersectTags(",") + test.S(t).ExpectNotNil(err) + } + { + _, err := ParseIntersectTags(",,,") + test.S(t).ExpectNotNil(err) + } + { + _, err := ParseIntersectTags("role,") + test.S(t).ExpectNotNil(err) + } + { + tags, err := ParseIntersectTags("role") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(len(tags), 1) + + test.S(t).ExpectEquals(tags[0].TagName, "role") + test.S(t).ExpectEquals(tags[0].TagValue, "") + test.S(t).ExpectFalse(tags[0].Negate) + test.S(t).ExpectFalse(tags[0].HasValue) + } + { + tags, err := ParseIntersectTags("role,dc") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(len(tags), 2) + + test.S(t).ExpectEquals(tags[0].TagName, "role") + test.S(t).ExpectEquals(tags[0].TagValue, "") + test.S(t).ExpectFalse(tags[0].Negate) + test.S(t).ExpectFalse(tags[0].HasValue) + + test.S(t).ExpectEquals(tags[1].TagName, "dc") + test.S(t).ExpectEquals(tags[1].TagValue, "") + test.S(t).ExpectFalse(tags[1].Negate) + test.S(t).ExpectFalse(tags[1].HasValue) + } + { + tags, err := ParseIntersectTags("role=backup, !dc=ny") + test.S(t).ExpectNil(err) + test.S(t).ExpectEquals(len(tags), 2) + + test.S(t).ExpectEquals(tags[0].TagName, "role") + test.S(t).ExpectEquals(tags[0].TagValue, "backup") + test.S(t).ExpectFalse(tags[0].Negate) + test.S(t).ExpectTrue(tags[0].HasValue) + + test.S(t).ExpectEquals(tags[1].TagName, "dc") + test.S(t).ExpectEquals(tags[1].TagValue, "ny") + test.S(t).ExpectTrue(tags[1].Negate) + test.S(t).ExpectTrue(tags[1].HasValue) + } +} diff --git a/go/vt/orchestrator/inst/write_buffer.go b/go/vt/orchestrator/inst/write_buffer.go new file mode 100644 index 0000000000..7745b13321 --- /dev/null +++ b/go/vt/orchestrator/inst/write_buffer.go @@ -0,0 +1,133 @@ +/* + Copyright 2017 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package inst + +/* + query holds information about query metrics and records the time taken + waiting before doing the query plus the time taken executing the query. +*/ +import ( + "time" + + "vitess.io/vitess/go/vt/orchestrator/collection" + "vitess.io/vitess/go/vt/orchestrator/config" + + "github.com/montanaflynn/stats" +) + +// Metric records query metrics of backend writes that go through +// a sized channel. It allows us to compare the time waiting to +// execute the query against the time needed to run it and in a +// "sized channel" the wait time may be significant and is good to +// measure. 
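+//
+// Illustrative sketch only (hypothetical variable names such as flushedInstances,
+// enqueuedAt, flushStarted and writeBufferMetricsCollection are assumptions for
+// the example, not part of this change): a metric is built around one flush and
+// appended to a collection.Collection, from which AggregatedSince() later
+// computes the aggregates below:
+//
+//	metric := &WriteBufferMetric{
+//		Timestamp:    time.Now(),
+//		Instances:    len(flushedInstances),
+//		WaitLatency:  flushStarted.Sub(enqueuedAt),
+//		WriteLatency: time.Since(flushStarted),
+//	}
+//	writeBufferMetricsCollection.Append(metric)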
+type WriteBufferMetric struct { + Timestamp time.Time // time the metric was started + Instances int // number of flushed instances + WaitLatency time.Duration // waiting before flush + WriteLatency time.Duration // time writing to backend +} + +// When records the timestamp of the start of the recording +func (m WriteBufferMetric) When() time.Time { + return m.Timestamp +} + +type AggregatedWriteBufferMetric struct { + InstanceWriteBufferSize int // config setting + InstanceFlushIntervalMilliseconds int // config setting + CountInstances int + MaxInstances float64 + MeanInstances float64 + MedianInstances float64 + P95Instances float64 + MaxWaitSeconds float64 + MeanWaitSeconds float64 + MedianWaitSeconds float64 + P95WaitSeconds float64 + MaxWriteSeconds float64 + MeanWriteSeconds float64 + MedianWriteSeconds float64 + P95WriteSeconds float64 +} + +// AggregatedSince returns the aggregated query metrics for the period +// given from the values provided. +func AggregatedSince(c *collection.Collection, t time.Time) AggregatedWriteBufferMetric { + + // Initialise timing metrics + var instancesCounter []float64 + var waitTimings []float64 + var writeTimings []float64 + + // Retrieve values since the time specified + values, err := c.Since(t) + a := AggregatedWriteBufferMetric{ + InstanceWriteBufferSize: config.Config.InstanceWriteBufferSize, + InstanceFlushIntervalMilliseconds: config.Config.InstanceFlushIntervalMilliseconds, + } + if err != nil { + return a // empty data + } + + // generate the metrics + for _, v := range values { + instancesCounter = append(instancesCounter, float64(v.(*WriteBufferMetric).Instances)) + waitTimings = append(waitTimings, v.(*WriteBufferMetric).WaitLatency.Seconds()) + writeTimings = append(writeTimings, v.(*WriteBufferMetric).WriteLatency.Seconds()) + a.CountInstances += v.(*WriteBufferMetric).Instances + } + + // generate aggregate values + if s, err := stats.Max(stats.Float64Data(instancesCounter)); err == nil { + a.MaxInstances = s + } + if s, err := stats.Mean(stats.Float64Data(instancesCounter)); err == nil { + a.MeanInstances = s + } + if s, err := stats.Median(stats.Float64Data(instancesCounter)); err == nil { + a.MedianInstances = s + } + if s, err := stats.Percentile(stats.Float64Data(instancesCounter), 95); err == nil { + a.P95Instances = s + } + if s, err := stats.Max(stats.Float64Data(waitTimings)); err == nil { + a.MaxWaitSeconds = s + } + if s, err := stats.Mean(stats.Float64Data(waitTimings)); err == nil { + a.MeanWaitSeconds = s + } + if s, err := stats.Median(stats.Float64Data(waitTimings)); err == nil { + a.MedianWaitSeconds = s + } + if s, err := stats.Percentile(stats.Float64Data(waitTimings), 95); err == nil { + a.P95WaitSeconds = s + } + if s, err := stats.Max(stats.Float64Data(writeTimings)); err == nil { + a.MaxWriteSeconds = s + } + if s, err := stats.Mean(stats.Float64Data(writeTimings)); err == nil { + a.MeanWriteSeconds = s + } + if s, err := stats.Median(stats.Float64Data(writeTimings)); err == nil { + a.MedianWriteSeconds = s + } + if s, err := stats.Percentile(stats.Float64Data(writeTimings), 95); err == nil { + a.P95WriteSeconds = s + } + + return a +} diff --git a/go/vt/orchestrator/kv/consul.go b/go/vt/orchestrator/kv/consul.go new file mode 100644 index 0000000000..36d2e3fea9 --- /dev/null +++ b/go/vt/orchestrator/kv/consul.go @@ -0,0 +1,155 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/
+
+package kv
+
+import (
+	"crypto/tls"
+	"fmt"
+	"net/http"
+	"sync"
+	"sync/atomic"
+
+	"vitess.io/vitess/go/vt/orchestrator/config"
+
+	consulapi "github.com/armon/consul-api"
+	"github.com/patrickmn/go-cache"
+
+	"vitess.io/vitess/go/vt/orchestrator/external/golib/log"
+)
+
+// A Consul store based on config's `ConsulAddress`, `ConsulScheme`, and `ConsulKVPrefix`
+type consulStore struct {
+	client                        *consulapi.Client
+	kvCache                       *cache.Cache
+	pairsDistributionSuccessMutex sync.Mutex
+	distributionReentry           int64
+}
+
+// NewConsulStore creates a new consul store. It is possible that the client for this store is nil,
+// which is the case if no consul config is provided.
+func NewConsulStore() KVStore {
+	store := &consulStore{
+		kvCache: cache.New(cache.NoExpiration, cache.DefaultExpiration),
+	}
+
+	if config.Config.ConsulAddress != "" {
+		consulConfig := consulapi.DefaultConfig()
+		consulConfig.Address = config.Config.ConsulAddress
+		consulConfig.Scheme = config.Config.ConsulScheme
+		if config.Config.ConsulScheme == "https" {
+			consulConfig.HttpClient = &http.Client{
+				Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}},
+			}
+		}
+		// ConsulAclToken defaults to ""
+		consulConfig.Token = config.Config.ConsulAclToken
+		if client, err := consulapi.NewClient(consulConfig); err != nil {
+			log.Errore(err)
+		} else {
+			store.client = client
+		}
+	}
+	return store
+}
+
+func (this *consulStore) PutKeyValue(key string, value string) (err error) {
+	if this.client == nil {
+		return nil
+	}
+	pair := &consulapi.KVPair{Key: key, Value: []byte(value)}
+	_, err = this.client.KV().Put(pair, nil)
+	return err
+}
+
+func (this *consulStore) GetKeyValue(key string) (value string, found bool, err error) {
+	if this.client == nil {
+		return value, found, nil
+	}
+	pair, _, err := this.client.KV().Get(key, nil)
+	if err != nil {
+		return value, found, err
+	}
+	// A missing key yields a nil pair; check before dereferencing.
+	if pair == nil {
+		return value, false, nil
+	}
+	return string(pair.Value), true, nil
+}
+
+func (this *consulStore) DistributePairs(kvPairs [](*KVPair)) (err error) {
+	// This function is non re-entrant (it can only be running once at any point in time)
+	if atomic.CompareAndSwapInt64(&this.distributionReentry, 0, 1) {
+		defer atomic.StoreInt64(&this.distributionReentry, 0)
+	} else {
+		return
+	}
+
+	if !config.Config.ConsulCrossDataCenterDistribution {
+		return nil
+	}
+
+	datacenters, err := this.client.Catalog().Datacenters()
+	if err != nil {
+		return err
+	}
+	log.Debugf("consulStore.DistributePairs(): distributing %d pairs to %d datacenters", len(kvPairs), len(datacenters))
+	consulPairs := [](*consulapi.KVPair){}
+	for _, kvPair := range kvPairs {
+		consulPairs = append(consulPairs, &consulapi.KVPair{Key: kvPair.Key, Value: []byte(kvPair.Value)})
+	}
+	var wg sync.WaitGroup
+	for _, datacenter := range datacenters {
+		datacenter := datacenter
+		wg.Add(1)
+		go func() {
+			defer wg.Done()
+
+			writeOptions := &consulapi.WriteOptions{Datacenter: datacenter}
+			queryOptions := &consulapi.QueryOptions{Datacenter: datacenter}
+			skipped := 0
+			existing := 0
+			written := 0
+			failed := 0
+
+			for _, consulPair := range consulPairs {
+				val := 
string(consulPair.Value) + kcCacheKey := fmt.Sprintf("%s;%s", datacenter, consulPair.Key) + + if value, found := this.kvCache.Get(kcCacheKey); found && val == value { + skipped++ + continue + } + if pair, _, err := this.client.KV().Get(consulPair.Key, queryOptions); err == nil && pair != nil { + if val == string(pair.Value) { + existing++ + this.kvCache.SetDefault(kcCacheKey, val) + continue + } + } + + if _, e := this.client.KV().Put(consulPair, writeOptions); e != nil { + log.Errorf("consulStore.DistributePairs(): failed %s", kcCacheKey) + failed++ + err = e + } else { + log.Debugf("consulStore.DistributePairs(): written %s=%s", kcCacheKey, val) + written++ + this.kvCache.SetDefault(kcCacheKey, val) + } + } + log.Debugf("consulStore.DistributePairs(): datacenter: %s; skipped: %d, existing: %d, written: %d, failed: %d", datacenter, skipped, existing, written, failed) + }() + } + wg.Wait() + return err +} diff --git a/go/vt/orchestrator/kv/internal.go b/go/vt/orchestrator/kv/internal.go new file mode 100644 index 0000000000..9662ac4901 --- /dev/null +++ b/go/vt/orchestrator/kv/internal.go @@ -0,0 +1,67 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package kv + +import ( + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +// Internal key-value store, based on relational backend +type internalKVStore struct { +} + +func NewInternalKVStore() KVStore { + return &internalKVStore{} +} + +func (this *internalKVStore) PutKeyValue(key string, value string) (err error) { + _, err = db.ExecOrchestrator(` + replace + into kv_store ( + store_key, store_value, last_updated + ) values ( + ?, ?, now() + ) + `, key, value, + ) + return log.Errore(err) +} + +func (this *internalKVStore) GetKeyValue(key string) (value string, found bool, err error) { + query := ` + select + store_value + from + kv_store + where + store_key = ? + ` + + err = db.QueryOrchestrator(query, sqlutils.Args(key), func(m sqlutils.RowMap) error { + value = m.GetString("store_value") + found = true + return nil + }) + + return value, found, log.Errore(err) +} + +func (this *internalKVStore) DistributePairs(kvPairs [](*KVPair)) (err error) { + return nil +} diff --git a/go/vt/orchestrator/kv/kv.go b/go/vt/orchestrator/kv/kv.go new file mode 100644 index 0000000000..3eebcbc4de --- /dev/null +++ b/go/vt/orchestrator/kv/kv.go @@ -0,0 +1,101 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +package kv + +import ( + "fmt" + "sync" +) + +type KVPair struct { + Key string + Value string +} + +func NewKVPair(key string, value string) *KVPair { + return &KVPair{Key: key, Value: value} +} + +func (this *KVPair) String() string { + return fmt.Sprintf("%s:%s", this.Key, this.Value) +} + +type KVStore interface { + PutKeyValue(key string, value string) (err error) + GetKeyValue(key string) (value string, found bool, err error) + DistributePairs(kvPairs [](*KVPair)) (err error) +} + +var kvMutex sync.Mutex +var kvInitOnce sync.Once +var kvStores = []KVStore{} + +// InitKVStores initializes the KV stores (duh), once in the lifetime of this app. +// Configuration reload does not affect a running instance. +func InitKVStores() { + kvMutex.Lock() + defer kvMutex.Unlock() + + kvInitOnce.Do(func() { + kvStores = []KVStore{ + NewInternalKVStore(), + NewConsulStore(), + NewZkStore(), + } + }) +} + +func getKVStores() (stores []KVStore) { + kvMutex.Lock() + defer kvMutex.Unlock() + + stores = kvStores + return stores +} + +func GetValue(key string) (value string, found bool, err error) { + for _, store := range getKVStores() { + // It's really only the first (internal) that matters here + return store.GetKeyValue(key) + } + return value, found, err +} + +func PutValue(key string, value string) (err error) { + for _, store := range getKVStores() { + if err := store.PutKeyValue(key, value); err != nil { + return err + } + } + return nil +} + +func PutKVPair(kvPair *KVPair) (err error) { + if kvPair == nil { + return nil + } + return PutValue(kvPair.Key, kvPair.Value) +} + +func DistributePairs(kvPairs [](*KVPair)) (err error) { + for _, store := range getKVStores() { + if err := store.DistributePairs(kvPairs); err != nil { + return err + } + } + return nil +} diff --git a/go/vt/orchestrator/kv/zk.go b/go/vt/orchestrator/kv/zk.go new file mode 100644 index 0000000000..707c32c5a0 --- /dev/null +++ b/go/vt/orchestrator/kv/zk.go @@ -0,0 +1,80 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/
+
+package kv
+
+import (
+	"fmt"
+	"math/rand"
+	"strings"
+	"time"
+
+	zkconstants "github.com/samuel/go-zookeeper/zk"
+	"vitess.io/vitess/go/vt/orchestrator/config"
+	"vitess.io/vitess/go/vt/orchestrator/external/zk"
+)
+
+// ZooKeeper-backed key-value store, configured via config's `ZkAddress`
+type zkStore struct {
+	zook *zk.ZooKeeper
+}
+
+func normalizeKey(key string) (normalizedKey string) {
+	normalizedKey = strings.TrimLeft(key, "/")
+	normalizedKey = fmt.Sprintf("/%s", normalizedKey)
+	return normalizedKey
+}
+
+func NewZkStore() KVStore {
+	store := &zkStore{}
+
+	if config.Config.ZkAddress != "" {
+		rand.Seed(time.Now().UnixNano())
+
+		serversArray := strings.Split(config.Config.ZkAddress, ",")
+		zook := zk.NewZooKeeper()
+		zook.SetServers(serversArray)
+		store.zook = zook
+	}
+	return store
+}
+
+func (this *zkStore) PutKeyValue(key string, value string) (err error) {
+	if this.zook == nil {
+		return nil
+	}
+
+	if _, err = this.zook.Set(normalizeKey(key), []byte(value)); err == zkconstants.ErrNoNode {
+		aclstr := ""
+		_, err = this.zook.Create(normalizeKey(key), []byte(value), aclstr, true)
+	}
+	return err
+}
+
+func (this *zkStore) GetKeyValue(key string) (value string, found bool, err error) {
+	if this.zook == nil {
+		return value, false, nil
+	}
+	result, err := this.zook.Get(normalizeKey(key))
+	if err != nil {
+		return value, false, err
+	}
+	return string(result), true, nil
+}
+
+func (this *zkStore) DistributePairs(kvPairs [](*KVPair)) (err error) {
+	return nil
+}
diff --git a/go/vt/orchestrator/logic/command_applier.go b/go/vt/orchestrator/logic/command_applier.go
new file mode 100644
index 0000000000..6738de93d4
--- /dev/null
+++ b/go/vt/orchestrator/logic/command_applier.go
@@ -0,0 +1,311 @@
+/*
+ Copyright 2017 Shlomi Noach, GitHub Inc.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+*/ + +package logic + +import ( + "encoding/json" + + "vitess.io/vitess/go/vt/orchestrator/inst" + "vitess.io/vitess/go/vt/orchestrator/kv" + orcraft "vitess.io/vitess/go/vt/orchestrator/raft" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +// AsyncRequest represents an entry in the async_request table +type CommandApplier struct { +} + +func NewCommandApplier() *CommandApplier { + applier := &CommandApplier{} + return applier +} + +func (applier *CommandApplier) ApplyCommand(op string, value []byte) interface{} { + switch op { + case "heartbeat": + return nil + case "async-snapshot": + return applier.asyncSnapshot(value) + case "register-node": + return applier.registerNode(value) + case "discover": + return applier.discover(value) + case "injected-pseudo-gtid": + return applier.injectedPseudoGTID(value) + case "forget": + return applier.forget(value) + case "forget-cluster": + return applier.forgetCluster(value) + case "begin-downtime": + return applier.beginDowntime(value) + case "end-downtime": + return applier.endDowntime(value) + case "register-candidate": + return applier.registerCandidate(value) + case "ack-recovery": + return applier.ackRecovery(value) + case "register-hostname-unresolve": + return applier.registerHostnameUnresolve(value) + case "submit-pool-instances": + return applier.submitPoolInstances(value) + case "register-failure-detection": + return applier.registerFailureDetection(value) + case "write-recovery": + return applier.writeRecovery(value) + case "write-recovery-step": + return applier.writeRecoveryStep(value) + case "resolve-recovery": + return applier.resolveRecovery(value) + case "disable-global-recoveries": + return applier.disableGlobalRecoveries(value) + case "enable-global-recoveries": + return applier.enableGlobalRecoveries(value) + case "put-key-value": + return applier.putKeyValue(value) + case "put-instance-tag": + return applier.putInstanceTag(value) + case "delete-instance-tag": + return applier.deleteInstanceTag(value) + case "leader-uri": + return applier.leaderURI(value) + case "request-health-report": + return applier.healthReport(value) + case "set-cluster-alias-manual-override": + return applier.setClusterAliasManualOverride(value) + } + return log.Errorf("Unknown command op: %s", op) +} + +func (applier *CommandApplier) asyncSnapshot(value []byte) interface{} { + err := orcraft.AsyncSnapshot() + return err +} + +func (applier *CommandApplier) registerNode(value []byte) interface{} { + return nil +} + +func (applier *CommandApplier) discover(value []byte) interface{} { + instanceKey := inst.InstanceKey{} + if err := json.Unmarshal(value, &instanceKey); err != nil { + return log.Errore(err) + } + DiscoverInstance(instanceKey) + return nil +} + +func (applier *CommandApplier) injectedPseudoGTID(value []byte) interface{} { + var clusterName string + if err := json.Unmarshal(value, &clusterName); err != nil { + return log.Errore(err) + } + inst.RegisterInjectedPseudoGTID(clusterName) + return nil +} + +func (applier *CommandApplier) forget(value []byte) interface{} { + instanceKey := inst.InstanceKey{} + if err := json.Unmarshal(value, &instanceKey); err != nil { + return log.Errore(err) + } + err := inst.ForgetInstance(&instanceKey) + return err +} + +func (applier *CommandApplier) forgetCluster(value []byte) interface{} { + var clusterName string + if err := json.Unmarshal(value, &clusterName); err != nil { + return log.Errore(err) + } + err := inst.ForgetCluster(clusterName) + return err +} + +func (applier *CommandApplier) 
beginDowntime(value []byte) interface{} { + downtime := inst.Downtime{} + if err := json.Unmarshal(value, &downtime); err != nil { + return log.Errore(err) + } + err := inst.BeginDowntime(&downtime) + return err +} + +func (applier *CommandApplier) endDowntime(value []byte) interface{} { + instanceKey := inst.InstanceKey{} + if err := json.Unmarshal(value, &instanceKey); err != nil { + return log.Errore(err) + } + _, err := inst.EndDowntime(&instanceKey) + return err +} + +func (applier *CommandApplier) registerCandidate(value []byte) interface{} { + candidate := inst.CandidateDatabaseInstance{} + if err := json.Unmarshal(value, &candidate); err != nil { + return log.Errore(err) + } + err := inst.RegisterCandidateInstance(&candidate) + return err +} + +func (applier *CommandApplier) ackRecovery(value []byte) interface{} { + ack := RecoveryAcknowledgement{} + err := json.Unmarshal(value, &ack) + if err != nil { + return log.Errore(err) + } + if ack.AllRecoveries { + _, err = AcknowledgeAllRecoveries(ack.Owner, ack.Comment) + } + if ack.ClusterName != "" { + _, err = AcknowledgeClusterRecoveries(ack.ClusterName, ack.Owner, ack.Comment) + } + if ack.Key.IsValid() { + _, err = AcknowledgeInstanceRecoveries(&ack.Key, ack.Owner, ack.Comment) + } + if ack.Id > 0 { + _, err = AcknowledgeRecovery(ack.Id, ack.Owner, ack.Comment) + } + if ack.UID != "" { + _, err = AcknowledgeRecoveryByUID(ack.UID, ack.Owner, ack.Comment) + } + return err +} + +func (applier *CommandApplier) registerHostnameUnresolve(value []byte) interface{} { + registration := inst.HostnameRegistration{} + if err := json.Unmarshal(value, ®istration); err != nil { + return log.Errore(err) + } + err := inst.RegisterHostnameUnresolve(®istration) + return err +} + +func (applier *CommandApplier) submitPoolInstances(value []byte) interface{} { + submission := inst.PoolInstancesSubmission{} + if err := json.Unmarshal(value, &submission); err != nil { + return log.Errore(err) + } + err := inst.ApplyPoolInstances(&submission) + return err +} + +func (applier *CommandApplier) registerFailureDetection(value []byte) interface{} { + analysisEntry := inst.ReplicationAnalysis{} + if err := json.Unmarshal(value, &analysisEntry); err != nil { + return log.Errore(err) + } + _, err := AttemptFailureDetectionRegistration(&analysisEntry) + return err +} + +func (applier *CommandApplier) writeRecovery(value []byte) interface{} { + topologyRecovery := TopologyRecovery{} + if err := json.Unmarshal(value, &topologyRecovery); err != nil { + return log.Errore(err) + } + if _, err := writeTopologyRecovery(&topologyRecovery); err != nil { + return err + } + return nil +} + +func (applier *CommandApplier) writeRecoveryStep(value []byte) interface{} { + topologyRecoveryStep := TopologyRecoveryStep{} + if err := json.Unmarshal(value, &topologyRecoveryStep); err != nil { + return log.Errore(err) + } + err := writeTopologyRecoveryStep(&topologyRecoveryStep) + return err +} + +func (applier *CommandApplier) resolveRecovery(value []byte) interface{} { + topologyRecovery := TopologyRecovery{} + if err := json.Unmarshal(value, &topologyRecovery); err != nil { + return log.Errore(err) + } + if err := writeResolveRecovery(&topologyRecovery); err != nil { + return log.Errore(err) + } + return nil +} + +func (applier *CommandApplier) disableGlobalRecoveries(value []byte) interface{} { + err := DisableRecovery() + return err +} + +func (applier *CommandApplier) enableGlobalRecoveries(value []byte) interface{} { + err := EnableRecovery() + return err +} + +func (applier 
*CommandApplier) putKeyValue(value []byte) interface{} { + kvPair := kv.KVPair{} + if err := json.Unmarshal(value, &kvPair); err != nil { + return log.Errore(err) + } + err := kv.PutKVPair(&kvPair) + return err +} + +func (applier *CommandApplier) putInstanceTag(value []byte) interface{} { + instanceTag := inst.InstanceTag{} + if err := json.Unmarshal(value, &instanceTag); err != nil { + return log.Errore(err) + } + err := inst.PutInstanceTag(&instanceTag.Key, &instanceTag.T) + return err +} + +func (applier *CommandApplier) deleteInstanceTag(value []byte) interface{} { + instanceTag := inst.InstanceTag{} + if err := json.Unmarshal(value, &instanceTag); err != nil { + return log.Errore(err) + } + _, err := inst.Untag(&instanceTag.Key, &instanceTag.T) + return err +} + +func (applier *CommandApplier) leaderURI(value []byte) interface{} { + var uri string + if err := json.Unmarshal(value, &uri); err != nil { + return log.Errore(err) + } + orcraft.LeaderURI.Set(uri) + return nil +} + +func (applier *CommandApplier) healthReport(value []byte) interface{} { + var authenticationToken string + if err := json.Unmarshal(value, &authenticationToken); err != nil { + return log.Errore(err) + } + orcraft.ReportToRaftLeader(authenticationToken) + return nil +} + +func (applier *CommandApplier) setClusterAliasManualOverride(value []byte) interface{} { + var params [2]string + if err := json.Unmarshal(value, ¶ms); err != nil { + return log.Errore(err) + } + clusterName, alias := params[0], params[1] + err := inst.SetClusterAliasManualOverride(clusterName, alias) + return err +} diff --git a/go/vt/orchestrator/logic/disable_recovery.go b/go/vt/orchestrator/logic/disable_recovery.go new file mode 100644 index 0000000000..384201ef7e --- /dev/null +++ b/go/vt/orchestrator/logic/disable_recovery.go @@ -0,0 +1,85 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package logic + +// This file holds wrappers around routines to check if global +// recovery is disabled or not. +// +// This is determined by looking in the table +// orchestrator.global_recovery_disable for a value 1. Note: for +// recoveries to actually happen this must be configured explicitly +// in orchestrator.conf.json. This setting is an emergency brake +// to quickly be able to prevent recoveries happening in some large +// outage type situation. Maybe this value should be cached etc +// but we won't be doing that many recoveries at once so the load +// on this table is expected to be very low. It should be fine to +// go to the database each time. + +import ( + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +// IsRecoveryDisabled returns true if Recoveries are disabled globally +func IsRecoveryDisabled() (disabled bool, err error) { + query := ` + SELECT + COUNT(*) as mycount + FROM + global_recovery_disable + WHERE + disable_recovery=? 
+ ` + err = db.QueryOrchestrator(query, sqlutils.Args(1), func(m sqlutils.RowMap) error { + mycount := m.GetInt("mycount") + disabled = (mycount > 0) + return nil + }) + if err != nil { + err = log.Errorf("recovery.IsRecoveryDisabled(): %v", err) + } + return disabled, err +} + +// DisableRecovery ensures recoveries are disabled globally +func DisableRecovery() error { + _, err := db.ExecOrchestrator(` + INSERT IGNORE INTO global_recovery_disable + (disable_recovery) + VALUES (1) + `, + ) + return err +} + +// EnableRecovery ensures recoveries are enabled globally +func EnableRecovery() error { + // The "WHERE" clause is just to avoid full-scan reports by monitoring tools + _, err := db.ExecOrchestrator(` + DELETE FROM global_recovery_disable WHERE disable_recovery >= 0 + `, + ) + return err +} + +func SetRecoveryDisabled(disabled bool) error { + if disabled { + return DisableRecovery() + } + return EnableRecovery() +} diff --git a/go/vt/orchestrator/logic/orchestrator.go b/go/vt/orchestrator/logic/orchestrator.go new file mode 100644 index 0000000000..6a2b3560cc --- /dev/null +++ b/go/vt/orchestrator/logic/orchestrator.go @@ -0,0 +1,668 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package logic + +import ( + "fmt" + "math/rand" + "os" + "os/signal" + "sync" + "sync/atomic" + "syscall" + "time" + + "github.com/patrickmn/go-cache" + "github.com/rcrowley/go-metrics" + "github.com/sjmudd/stopwatch" + "vitess.io/vitess/go/vt/orchestrator/agent" + "vitess.io/vitess/go/vt/orchestrator/collection" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/discovery" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/inst" + "vitess.io/vitess/go/vt/orchestrator/kv" + ometrics "vitess.io/vitess/go/vt/orchestrator/metrics" + "vitess.io/vitess/go/vt/orchestrator/process" + orcraft "vitess.io/vitess/go/vt/orchestrator/raft" + "vitess.io/vitess/go/vt/orchestrator/util" +) + +const ( + discoveryMetricsName = "DISCOVERY_METRICS" + yieldAfterUnhealthyDuration = 5 * config.HealthPollSeconds * time.Second + fatalAfterUnhealthyDuration = 30 * config.HealthPollSeconds * time.Second +) + +// discoveryQueue is a channel of deduplicated instanceKey-s +// that were requested for discovery. It can be continuously updated +// as discovery process progresses. 
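+//
+// For illustration, the intended producer/consumer flow around this queue
+// (a sketch only, not additional code in this change):
+//
+//	discoveryQueue.Push(instanceKey) // producer: enqueue, deduplicated against pending keys
+//	key := discoveryQueue.Consume()  // worker: block until a key is available
+//	DiscoverInstance(key)
+//	discoveryQueue.Release(key)      // worker: mark the key as no longer in flight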
+var discoveryQueue *discovery.Queue +var snapshotDiscoveryKeys chan inst.InstanceKey +var snapshotDiscoveryKeysMutex sync.Mutex + +var discoveriesCounter = metrics.NewCounter() +var failedDiscoveriesCounter = metrics.NewCounter() +var instancePollSecondsExceededCounter = metrics.NewCounter() +var discoveryQueueLengthGauge = metrics.NewGauge() +var discoveryRecentCountGauge = metrics.NewGauge() +var isElectedGauge = metrics.NewGauge() +var isHealthyGauge = metrics.NewGauge() +var isRaftHealthyGauge = metrics.NewGauge() +var isRaftLeaderGauge = metrics.NewGauge() +var discoveryMetrics = collection.CreateOrReturnCollection(discoveryMetricsName) + +var isElectedNode int64 = 0 + +var recentDiscoveryOperationKeys *cache.Cache +var pseudoGTIDPublishCache = cache.New(time.Minute, time.Second) +var kvFoundCache = cache.New(10*time.Minute, time.Minute) + +func init() { + snapshotDiscoveryKeys = make(chan inst.InstanceKey, 10) + + metrics.Register("discoveries.attempt", discoveriesCounter) + metrics.Register("discoveries.fail", failedDiscoveriesCounter) + metrics.Register("discoveries.instance_poll_seconds_exceeded", instancePollSecondsExceededCounter) + metrics.Register("discoveries.queue_length", discoveryQueueLengthGauge) + metrics.Register("discoveries.recent_count", discoveryRecentCountGauge) + metrics.Register("elect.is_elected", isElectedGauge) + metrics.Register("health.is_healthy", isHealthyGauge) + metrics.Register("raft.is_healthy", isRaftHealthyGauge) + metrics.Register("raft.is_leader", isRaftLeaderGauge) + + ometrics.OnMetricsTick(func() { + discoveryQueueLengthGauge.Update(int64(discoveryQueue.QueueLen())) + }) + ometrics.OnMetricsTick(func() { + if recentDiscoveryOperationKeys == nil { + return + } + discoveryRecentCountGauge.Update(int64(recentDiscoveryOperationKeys.ItemCount())) + }) + ometrics.OnMetricsTick(func() { + isElectedGauge.Update(atomic.LoadInt64(&isElectedNode)) + }) + ometrics.OnMetricsTick(func() { + isHealthyGauge.Update(atomic.LoadInt64(&process.LastContinousCheckHealthy)) + }) + ometrics.OnMetricsTick(func() { + var healthy int64 + if orcraft.IsHealthy() { + healthy = 1 + } + isRaftHealthyGauge.Update(healthy) + }) + ometrics.OnMetricsTick(func() { + isRaftLeaderGauge.Update(atomic.LoadInt64(&isElectedNode)) + }) +} + +func IsLeader() bool { + if orcraft.IsRaftEnabled() { + return orcraft.IsLeader() + } + return atomic.LoadInt64(&isElectedNode) == 1 +} + +func IsLeaderOrActive() bool { + if orcraft.IsRaftEnabled() { + return orcraft.IsPartOfQuorum() + } + return atomic.LoadInt64(&isElectedNode) == 1 +} + +// used in several places +func instancePollSecondsDuration() time.Duration { + return time.Duration(config.Config.InstancePollSeconds) * time.Second +} + +// acceptSignals registers for OS signals +func acceptSignals() { + c := make(chan os.Signal, 1) + + signal.Notify(c, syscall.SIGHUP) + signal.Notify(c, syscall.SIGTERM) + go func() { + for sig := range c { + switch sig { + case syscall.SIGHUP: + log.Infof("Received SIGHUP. Reloading configuration") + inst.AuditOperation("reload-configuration", nil, "Triggered via SIGHUP") + config.Reload() + discoveryMetrics.SetExpirePeriod(time.Duration(config.Config.DiscoveryCollectionRetentionSeconds) * time.Second) + case syscall.SIGTERM: + log.Infof("Received SIGTERM. Shutting down orchestrator") + discoveryMetrics.StopAutoExpiration() + // probably should poke other go routines to stop cleanly here ... 
+ inst.AuditOperation("shutdown", nil, "Triggered via SIGTERM") + os.Exit(0) + } + } + }() +} + +// handleDiscoveryRequests iterates the discoveryQueue channel and calls upon +// instance discovery per entry. +func handleDiscoveryRequests() { + discoveryQueue = discovery.CreateOrReturnQueue("DEFAULT") + + // create a pool of discovery workers + for i := uint(0); i < config.Config.DiscoveryMaxConcurrency; i++ { + go func() { + for { + instanceKey := discoveryQueue.Consume() + // Possibly this used to be the elected node, but has + // been demoted, while still the queue is full. + if !IsLeaderOrActive() { + log.Debugf("Node apparently demoted. Skipping discovery of %+v. "+ + "Remaining queue size: %+v", instanceKey, discoveryQueue.QueueLen()) + discoveryQueue.Release(instanceKey) + continue + } + + DiscoverInstance(instanceKey) + discoveryQueue.Release(instanceKey) + } + }() + } +} + +// DiscoverInstance will attempt to discover (poll) an instance (unless +// it is already up to date) and will also ensure that its master and +// replicas (if any) are also checked. +func DiscoverInstance(instanceKey inst.InstanceKey) { + if inst.InstanceIsForgotten(&instanceKey) { + log.Debugf("discoverInstance: skipping discovery of %+v because it is set to be forgotten", instanceKey) + return + } + if inst.RegexpMatchPatterns(instanceKey.StringCode(), config.Config.DiscoveryIgnoreHostnameFilters) { + log.Debugf("discoverInstance: skipping discovery of %+v because it matches DiscoveryIgnoreHostnameFilters", instanceKey) + return + } + + // create stopwatch entries + latency := stopwatch.NewNamedStopwatch() + latency.AddMany([]string{ + "backend", + "instance", + "total"}) + latency.Start("total") // start the total stopwatch (not changed anywhere else) + + defer func() { + latency.Stop("total") + discoveryTime := latency.Elapsed("total") + if discoveryTime > instancePollSecondsDuration() { + instancePollSecondsExceededCounter.Inc(1) + log.Warningf("discoverInstance exceeded InstancePollSeconds for %+v, took %.4fs", instanceKey, discoveryTime.Seconds()) + } + }() + + instanceKey.ResolveHostname() + if !instanceKey.IsValid() { + return + } + + // Calculate the expiry period each time as InstancePollSeconds + // _may_ change during the run of the process (via SIGHUP) and + // it is not possible to change the cache's default expiry.. + if existsInCacheError := recentDiscoveryOperationKeys.Add(instanceKey.DisplayString(), true, instancePollSecondsDuration()); existsInCacheError != nil { + // Just recently attempted + return + } + + latency.Start("backend") + instance, found, err := inst.ReadInstance(&instanceKey) + latency.Stop("backend") + if found && instance.IsUpToDate && instance.IsLastCheckValid { + // we've already discovered this one. Skip! + return + } + + discoveriesCounter.Inc(1) + + // First we've ever heard of this instance. Continue investigation: + instance, err = inst.ReadTopologyInstanceBufferable(&instanceKey, config.Config.BufferInstanceWrites, latency) + // panic can occur (IO stuff). Therefore it may happen + // that instance is nil. Check it, but first get the timing metrics. 
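+	// The three stopwatch entries read below ("total", "backend", "instance")
+	// map onto the TotalLatency, BackendLatency and InstanceLatency fields of
+	// the discovery.Metric appended further down.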
+	totalLatency := latency.Elapsed("total")
+	backendLatency := latency.Elapsed("backend")
+	instanceLatency := latency.Elapsed("instance")
+
+	if instance == nil {
+		failedDiscoveriesCounter.Inc(1)
+		discoveryMetrics.Append(&discovery.Metric{
+			Timestamp:       time.Now(),
+			InstanceKey:     instanceKey,
+			TotalLatency:    totalLatency,
+			BackendLatency:  backendLatency,
+			InstanceLatency: instanceLatency,
+			Err:             err,
+		})
+		if util.ClearToLog("discoverInstance", instanceKey.StringCode()) {
+			log.Warningf(" DiscoverInstance(%+v) instance is nil in %.3fs (Backend: %.3fs, Instance: %.3fs), error=%+v",
+				instanceKey,
+				totalLatency.Seconds(),
+				backendLatency.Seconds(),
+				instanceLatency.Seconds(),
+				err)
+		}
+		return
+	}
+
+	discoveryMetrics.Append(&discovery.Metric{
+		Timestamp:       time.Now(),
+		InstanceKey:     instanceKey,
+		TotalLatency:    totalLatency,
+		BackendLatency:  backendLatency,
+		InstanceLatency: instanceLatency,
+		Err:             nil,
+	})
+
+	if !IsLeaderOrActive() {
+		// Maybe this node was elected before, but isn't elected anymore.
+		// If not elected, stop drilling up/down the topology
+		return
+	}
+
+	// Investigate replicas and members of the same replication group:
+	for _, replicaKey := range append(instance.ReplicationGroupMembers.GetInstanceKeys(), instance.Replicas.GetInstanceKeys()...) {
+		replicaKey := replicaKey // not needed? no concurrency here?
+
+		// Avoid noticing some hosts we would otherwise discover
+		if inst.RegexpMatchPatterns(replicaKey.StringCode(), config.Config.DiscoveryIgnoreReplicaHostnameFilters) {
+			continue
+		}
+
+		if replicaKey.IsValid() {
+			discoveryQueue.Push(replicaKey)
+		}
+	}
+	// Investigate master:
+	if instance.MasterKey.IsValid() {
+		if !inst.RegexpMatchPatterns(instance.MasterKey.StringCode(), config.Config.DiscoveryIgnoreMasterHostnameFilters) {
+			discoveryQueue.Push(instance.MasterKey)
+		}
+	}
+}
+
+// onHealthTick handles the actions to take to discover/poll instances
+func onHealthTick() {
+	wasAlreadyElected := IsLeader()
+
+	if orcraft.IsRaftEnabled() {
+		if orcraft.IsLeader() {
+			atomic.StoreInt64(&isElectedNode, 1)
+		} else {
+			atomic.StoreInt64(&isElectedNode, 0)
+		}
+		if process.SinceLastGoodHealthCheck() > yieldAfterUnhealthyDuration {
+			log.Errorf("Health test is failing for over %+v seconds. raft yielding", yieldAfterUnhealthyDuration.Seconds())
+			orcraft.Yield()
+		}
+		if process.SinceLastGoodHealthCheck() > fatalAfterUnhealthyDuration {
+			orcraft.FatalRaftError(fmt.Errorf("Node is unable to register health. Please check database connectivity."))
+		}
+	}
+	if !orcraft.IsRaftEnabled() {
+		myIsElectedNode, err := process.AttemptElection()
+		if err != nil {
+			log.Errore(err)
+		}
+		if myIsElectedNode {
+			atomic.StoreInt64(&isElectedNode, 1)
+		} else {
+			atomic.StoreInt64(&isElectedNode, 0)
+		}
+		if !myIsElectedNode {
+			if electedNode, _, err := process.ElectedNode(); err == nil {
+				log.Infof("Not elected as active node; active node: %v; polling", electedNode.Hostname)
+			} else {
+				log.Infof("Not elected as active node; active node: Unable to determine: %v; polling", err)
+			}
+		}
+	}
+	if !IsLeaderOrActive() {
+		return
+	}
+	instanceKeys, err := inst.ReadOutdatedInstanceKeys()
+	if err != nil {
+		log.Errore(err)
+	}
+
+	if !wasAlreadyElected {
+		// Just turned to be leader!
+		go process.RegisterNode(process.ThisNodeHealth)
+		go inst.ExpireMaintenance()
+	}
+
+	func() {
+		// Normally onHealthTick() shouldn't run concurrently. It is kicked by a ticker.
+		// However it _is_ invoked inside a goroutine. I like to be safe here.
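+		// snapshotDiscoveryKeys is fed by the raft snapshot restore path (see
+		// SnapshotDataCreatorApplier.Restore); draining it here merges those
+		// keys into the regular discovery flow.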
+ snapshotDiscoveryKeysMutex.Lock() + defer snapshotDiscoveryKeysMutex.Unlock() + + countSnapshotKeys := len(snapshotDiscoveryKeys) + for i := 0; i < countSnapshotKeys; i++ { + instanceKeys = append(instanceKeys, <-snapshotDiscoveryKeys) + } + }() + // avoid any logging unless there's something to be done + if len(instanceKeys) > 0 { + for _, instanceKey := range instanceKeys { + if instanceKey.IsValid() { + discoveryQueue.Push(instanceKey) + } + } + } +} + +// publishDiscoverMasters will publish to raft a discovery request for all known masters. +// This makes for a best-effort keep-in-sync between raft nodes, where some may have +// inconsistent data due to hosts being forgotten, for example. +func publishDiscoverMasters() error { + instances, err := inst.ReadWriteableClustersMasters() + if err == nil { + for _, instance := range instances { + key := instance.Key + go orcraft.PublishCommand("discover", key) + } + } + return log.Errore(err) +} + +// InjectPseudoGTIDOnWriters will inject a PseudoGTID entry on all writable, accessible, +// supported writers. +func InjectPseudoGTIDOnWriters() error { + instances, err := inst.ReadWriteableClustersMasters() + if err != nil { + return log.Errore(err) + } + for i := range rand.Perm(len(instances)) { + instance := instances[i] + go func() { + if injected, _ := inst.CheckAndInjectPseudoGTIDOnWriter(instance); injected { + clusterName := instance.ClusterName + if orcraft.IsRaftEnabled() { + // We prefer not saturating our raft communication. Pseudo-GTID information is + // OK to be cached for a while. + if _, found := pseudoGTIDPublishCache.Get(clusterName); !found { + pseudoGTIDPublishCache.Set(clusterName, true, cache.DefaultExpiration) + orcraft.PublishCommand("injected-pseudo-gtid", clusterName) + } + } else { + inst.RegisterInjectedPseudoGTID(clusterName) + } + } + }() + } + return nil +} + +// Write a cluster's master (or all clusters masters) to kv stores. +// This should generally only happen once in a lifetime of a cluster. Otherwise KV +// stores are updated via failovers. +func SubmitMastersToKvStores(clusterName string, force bool) (kvPairs [](*kv.KVPair), submittedCount int, err error) { + kvPairs, err = inst.GetMastersKVPairs(clusterName) + log.Debugf("kv.SubmitMastersToKvStores, clusterName: %s, force: %+v: numPairs: %+v", clusterName, force, len(kvPairs)) + if err != nil { + return kvPairs, submittedCount, log.Errore(err) + } + var selectedError error + var submitKvPairs [](*kv.KVPair) + for _, kvPair := range kvPairs { + if !force { + // !force: Called periodically to auto-populate KV + // We'd like to avoid some overhead. + if _, found := kvFoundCache.Get(kvPair.Key); found { + // Let's not overload database with queries. Let's not overload raft with events. + continue + } + v, found, err := kv.GetValue(kvPair.Key) + if err == nil && found && v == kvPair.Value { + // Already has the right value. 
+ kvFoundCache.Set(kvPair.Key, true, cache.DefaultExpiration) + continue + } + } + submitKvPairs = append(submitKvPairs, kvPair) + } + log.Debugf("kv.SubmitMastersToKvStores: submitKvPairs: %+v", len(submitKvPairs)) + for _, kvPair := range submitKvPairs { + if orcraft.IsRaftEnabled() { + _, err = orcraft.PublishCommand("put-key-value", kvPair) + } else { + err = kv.PutKVPair(kvPair) + } + if err == nil { + submittedCount++ + } else { + selectedError = err + } + } + if err := kv.DistributePairs(kvPairs); err != nil { + log.Errore(err) + } + return kvPairs, submittedCount, log.Errore(selectedError) +} + +func injectSeeds(seedOnce *sync.Once) { + seedOnce.Do(func() { + for _, seed := range config.Config.DiscoverySeeds { + instanceKey, err := inst.ParseRawInstanceKey(seed) + if err == nil { + inst.InjectSeed(instanceKey) + } else { + log.Errorf("Error parsing seed %s: %+v", seed, err) + } + } + }) +} + +// ContinuousDiscovery starts an asynchronuous infinite discovery process where instances are +// periodically investigated and their status captured, and long since unseen instances are +// purged and forgotten. +func ContinuousDiscovery() { + log.Infof("continuous discovery: setting up") + continuousDiscoveryStartTime := time.Now() + checkAndRecoverWaitPeriod := 3 * instancePollSecondsDuration() + recentDiscoveryOperationKeys = cache.New(instancePollSecondsDuration(), time.Second) + + inst.LoadHostnameResolveCache() + go handleDiscoveryRequests() + + healthTick := time.Tick(config.HealthPollSeconds * time.Second) + instancePollTick := time.Tick(instancePollSecondsDuration()) + caretakingTick := time.Tick(time.Minute) + raftCaretakingTick := time.Tick(10 * time.Minute) + recoveryTick := time.Tick(time.Duration(config.RecoveryPollSeconds) * time.Second) + autoPseudoGTIDTick := time.Tick(time.Duration(config.PseudoGTIDIntervalSeconds) * time.Second) + var recoveryEntrance int64 + var snapshotTopologiesTick <-chan time.Time + if config.Config.SnapshotTopologiesIntervalHours > 0 { + snapshotTopologiesTick = time.Tick(time.Duration(config.Config.SnapshotTopologiesIntervalHours) * time.Hour) + } + + runCheckAndRecoverOperationsTimeRipe := func() bool { + return time.Since(continuousDiscoveryStartTime) >= checkAndRecoverWaitPeriod + } + + var seedOnce sync.Once + + go ometrics.InitMetrics() + go ometrics.InitGraphiteMetrics() + go acceptSignals() + go kv.InitKVStores() + if config.Config.RaftEnabled { + if err := orcraft.Setup(NewCommandApplier(), NewSnapshotDataCreatorApplier(), process.ThisHostname); err != nil { + log.Fatale(err) + } + go orcraft.Monitor() + } + + if *config.RuntimeCLIFlags.GrabElection { + process.GrabElection() + } + + log.Infof("continuous discovery: starting") + for { + select { + case <-healthTick: + go func() { + onHealthTick() + }() + case <-instancePollTick: + go func() { + // This tick does NOT do instance poll (these are handled by the oversampling discoveryTick) + // But rather should invoke such routinely operations that need to be as (or roughly as) frequent + // as instance poll + if IsLeaderOrActive() { + go inst.UpdateClusterAliases() + go inst.ExpireDowntime() + go injectSeeds(&seedOnce) + } + }() + case <-autoPseudoGTIDTick: + go func() { + if config.Config.AutoPseudoGTID && IsLeader() { + go InjectPseudoGTIDOnWriters() + } + }() + case <-caretakingTick: + // Various periodic internal maintenance tasks + go func() { + if IsLeaderOrActive() { + go inst.RecordInstanceCoordinatesHistory() + go inst.ReviewUnseenInstances() + go inst.InjectUnseenMasters() + + go 
inst.ForgetLongUnseenInstances()
+					go inst.ForgetUnseenInstancesDifferentlyResolved()
+					go inst.ForgetExpiredHostnameResolves()
+					go inst.DeleteInvalidHostnameResolves()
+					go inst.ResolveUnknownMasterHostnameResolves()
+					go inst.ExpireMaintenance()
+					go inst.ExpireCandidateInstances()
+					go inst.ExpireHostnameUnresolve()
+					go inst.ExpireClusterDomainName()
+					go inst.ExpireAudit()
+					go inst.ExpireMasterPositionEquivalence()
+					go inst.ExpirePoolInstances()
+					go inst.FlushNontrivialResolveCacheToDatabase()
+					go inst.ExpireInjectedPseudoGTID()
+					go inst.ExpireStaleInstanceBinlogCoordinates()
+					go process.ExpireNodesHistory()
+					go process.ExpireAccessTokens()
+					go process.ExpireAvailableNodes()
+					go ExpireFailureDetectionHistory()
+					go ExpireTopologyRecoveryHistory()
+					go ExpireTopologyRecoveryStepsHistory()
+
+					if runCheckAndRecoverOperationsTimeRipe() && IsLeader() {
+						go SubmitMastersToKvStores("", false)
+					}
+				} else {
+					// Take this opportunity to refresh yourself
+					go inst.LoadHostnameResolveCache()
+				}
+			}()
+		case <-raftCaretakingTick:
+			if orcraft.IsRaftEnabled() && orcraft.IsLeader() {
+				go publishDiscoverMasters()
+			}
+		case <-recoveryTick:
+			go func() {
+				if IsLeaderOrActive() {
+					go ClearActiveFailureDetections()
+					go ClearActiveRecoveries()
+					go ExpireBlockedRecoveries()
+					go AcknowledgeCrashedRecoveries()
+					go inst.ExpireInstanceAnalysisChangelog()
+
+					go func() {
+						// This function is non re-entrant (it can only be running once at any point in time)
+						if atomic.CompareAndSwapInt64(&recoveryEntrance, 0, 1) {
+							defer atomic.StoreInt64(&recoveryEntrance, 0)
+						} else {
+							return
+						}
+						if runCheckAndRecoverOperationsTimeRipe() {
+							CheckAndRecover(nil, nil, false)
+						} else {
+							log.Debugf("Waiting for %+v seconds to pass before running failure detection/recovery", checkAndRecoverWaitPeriod.Seconds())
+						}
+					}()
+				}
+			}()
+		case <-snapshotTopologiesTick:
+			go func() {
+				if IsLeaderOrActive() {
+					go inst.SnapshotTopologies()
+				}
+			}()
+		}
+	}
+}
+
+func pollAgent(hostname string) error {
+	polledAgent, err := agent.GetAgent(hostname)
+	agent.UpdateAgentLastChecked(hostname)
+
+	if err != nil {
+		return log.Errore(err)
+	}
+
+	err = agent.UpdateAgentInfo(hostname, polledAgent)
+	if err != nil {
+		return log.Errore(err)
+	}
+
+	return nil
+}
+
+// ContinuousAgentsPoll starts an asynchronous infinite process where agents are
+// periodically investigated and their status captured, and long since unseen agents are
+// purged and forgotten.
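+//
+// Illustrative launch sketch (a hypothetical caller, not part of this change):
+//
+//	go logic.ContinuousAgentsPoll()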
+func ContinuousAgentsPoll() { + log.Infof("Starting continuous agents poll") + + go discoverSeededAgents() + + tick := time.Tick(config.HealthPollSeconds * time.Second) + caretakingTick := time.Tick(time.Hour) + for range tick { + agentsHosts, _ := agent.ReadOutdatedAgentsHosts() + log.Debugf("outdated agents hosts: %+v", agentsHosts) + for _, hostname := range agentsHosts { + go pollAgent(hostname) + } + // See if we should also forget agents (lower frequency) + select { + case <-caretakingTick: + agent.ForgetLongUnseenAgents() + agent.FailStaleSeeds() + default: + } + } +} + +func discoverSeededAgents() { + for seededAgent := range agent.SeededAgents { + instanceKey := &inst.InstanceKey{Hostname: seededAgent.Hostname, Port: int(seededAgent.MySQLPort)} + go inst.ReadTopologyInstance(instanceKey) + } +} diff --git a/go/vt/orchestrator/logic/snapshot_data.go b/go/vt/orchestrator/logic/snapshot_data.go new file mode 100644 index 0000000000..e131f3df30 --- /dev/null +++ b/go/vt/orchestrator/logic/snapshot_data.go @@ -0,0 +1,218 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package logic + +import ( + "bytes" + "compress/gzip" + "encoding/json" + "io" + + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/inst" + orcraft "vitess.io/vitess/go/vt/orchestrator/raft" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +type SnapshotData struct { + Keys []inst.InstanceKey // Kept for backwards comapatibility + MinimalInstances []inst.MinimalInstance + RecoveryDisabled bool + + ClusterAlias, + ClusterAliasOverride, + ClusterDomainName, + HostAttributes, + InstanceTags, + AccessToken, + PoolInstances, + InjectedPseudoGTIDClusters, + HostnameResolves, + HostnameUnresolves, + DowntimedInstances, + Candidates, + Detections, + KVStore, + Recovery, + RecoverySteps sqlutils.NamedResultData + + LeaderURI string +} + +func NewSnapshotData() *SnapshotData { + return &SnapshotData{} +} + +func readTableData(tableName string, data *sqlutils.NamedResultData) error { + orcdb, err := db.OpenOrchestrator() + if err != nil { + return log.Errore(err) + } + *data, err = sqlutils.ScanTable(orcdb, tableName) + return log.Errore(err) +} + +func writeTableData(tableName string, data *sqlutils.NamedResultData) error { + orcdb, err := db.OpenOrchestrator() + if err != nil { + return log.Errore(err) + } + err = sqlutils.WriteTable(orcdb, tableName, *data) + return log.Errore(err) +} + +func CreateSnapshotData() *SnapshotData { + snapshotData := NewSnapshotData() + + snapshotData.LeaderURI = orcraft.LeaderURI.Get() + // keys + snapshotData.Keys, _ = inst.ReadAllInstanceKeys() + snapshotData.MinimalInstances, _ = inst.ReadAllMinimalInstances() + snapshotData.RecoveryDisabled, _ = IsRecoveryDisabled() + + readTableData("cluster_alias", &snapshotData.ClusterAlias) + readTableData("cluster_alias_override", &snapshotData.ClusterAliasOverride) + readTableData("cluster_domain_name", 
&snapshotData.ClusterDomainName) + readTableData("access_token", &snapshotData.AccessToken) + readTableData("host_attributes", &snapshotData.HostAttributes) + readTableData("database_instance_tags", &snapshotData.InstanceTags) + readTableData("database_instance_pool", &snapshotData.PoolInstances) + readTableData("hostname_resolve", &snapshotData.HostnameResolves) + readTableData("hostname_unresolve", &snapshotData.HostnameUnresolves) + readTableData("database_instance_downtime", &snapshotData.DowntimedInstances) + readTableData("candidate_database_instance", &snapshotData.Candidates) + readTableData("topology_failure_detection", &snapshotData.Detections) + readTableData("kv_store", &snapshotData.KVStore) + readTableData("topology_recovery", &snapshotData.Recovery) + readTableData("topology_recovery_steps", &snapshotData.RecoverySteps) + readTableData("cluster_injected_pseudo_gtid", &snapshotData.InjectedPseudoGTIDClusters) + + log.Debugf("raft snapshot data created") + return snapshotData +} + +type SnapshotDataCreatorApplier struct { +} + +func NewSnapshotDataCreatorApplier() *SnapshotDataCreatorApplier { + generator := &SnapshotDataCreatorApplier{} + return generator +} + +func (this *SnapshotDataCreatorApplier) GetData() (data []byte, err error) { + snapshotData := CreateSnapshotData() + b, err := json.Marshal(snapshotData) + if err != nil { + return b, err + } + var buf bytes.Buffer + zw := gzip.NewWriter(&buf) + if _, err := zw.Write(b); err != nil { + return b, err + } + if err := zw.Close(); err != nil { + return b, err + } + return buf.Bytes(), nil +} + +func (this *SnapshotDataCreatorApplier) Restore(rc io.ReadCloser) error { + snapshotData := NewSnapshotData() + zr, err := gzip.NewReader(rc) + if err != nil { + return err + } + if err := json.NewDecoder(zr).Decode(&snapshotData); err != nil { + return err + } + + orcraft.LeaderURI.Set(snapshotData.LeaderURI) + // keys + { + snapshotInstanceKeyMap := inst.NewInstanceKeyMap() + snapshotInstanceKeyMap.AddKeys(snapshotData.Keys) + for _, minimalInstance := range snapshotData.MinimalInstances { + snapshotInstanceKeyMap.AddKey(minimalInstance.Key) + } + + discardedKeys := 0 + // Forget instances that were not in snapshot + existingKeys, _ := inst.ReadAllInstanceKeys() + for _, existingKey := range existingKeys { + if !snapshotInstanceKeyMap.HasKey(existingKey) { + inst.ForgetInstance(&existingKey) + discardedKeys++ + } + } + log.Debugf("raft snapshot restore: discarded %+v keys", discardedKeys) + existingKeysMap := inst.NewInstanceKeyMap() + existingKeysMap.AddKeys(existingKeys) + + // Discover instances that are in snapshot and not in our own database. + // Instances that _are_ in our own database will self-discover. No need + // to explicitly discover them. 
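+	// The snapshot carries both MinimalInstances (which include each instance's master key)
+	// and the legacy bare Keys list, so a node restoring a snapshot produced by an older
+	// leader (the v1 format handled below) still discovers the missing instances.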
+ discoveredKeys := 0 + // v2: read keys + master keys + for _, minimalInstance := range snapshotData.MinimalInstances { + if !existingKeysMap.HasKey(minimalInstance.Key) { + if err := inst.WriteInstance(minimalInstance.ToInstance(), false, nil); err == nil { + discoveredKeys++ + } else { + log.Errore(err) + } + } + } + if len(snapshotData.MinimalInstances) == 0 { + // v1: read keys (backwards support) + for _, snapshotKey := range snapshotData.Keys { + if !existingKeysMap.HasKey(snapshotKey) { + snapshotKey := snapshotKey + go func() { + snapshotDiscoveryKeys <- snapshotKey + }() + discoveredKeys++ + } + } + } + log.Debugf("raft snapshot restore: discovered %+v keys", discoveredKeys) + } + writeTableData("cluster_alias", &snapshotData.ClusterAlias) + writeTableData("cluster_alias_override", &snapshotData.ClusterAliasOverride) + writeTableData("cluster_domain_name", &snapshotData.ClusterDomainName) + writeTableData("access_token", &snapshotData.AccessToken) + writeTableData("host_attributes", &snapshotData.HostAttributes) + writeTableData("database_instance_tags", &snapshotData.InstanceTags) + writeTableData("database_instance_pool", &snapshotData.PoolInstances) + writeTableData("hostname_resolve", &snapshotData.HostnameResolves) + writeTableData("hostname_unresolve", &snapshotData.HostnameUnresolves) + writeTableData("database_instance_downtime", &snapshotData.DowntimedInstances) + writeTableData("candidate_database_instance", &snapshotData.Candidates) + writeTableData("kv_store", &snapshotData.KVStore) + writeTableData("topology_recovery", &snapshotData.Recovery) + writeTableData("topology_failure_detection", &snapshotData.Detections) + writeTableData("topology_recovery_steps", &snapshotData.RecoverySteps) + writeTableData("cluster_injected_pseudo_gtid", &snapshotData.InjectedPseudoGTIDClusters) + + // recovery disable + { + SetRecoveryDisabled(snapshotData.RecoveryDisabled) + } + log.Debugf("raft snapshot restore applied") + return nil +} diff --git a/go/vt/orchestrator/logic/topology_recovery.go b/go/vt/orchestrator/logic/topology_recovery.go new file mode 100644 index 0000000000..858c989d6b --- /dev/null +++ b/go/vt/orchestrator/logic/topology_recovery.go @@ -0,0 +1,2020 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package logic + +import ( + "encoding/json" + "fmt" + "math/rand" + goos "os" + "sort" + "strings" + "sync/atomic" + "time" + + "github.com/patrickmn/go-cache" + "github.com/rcrowley/go-metrics" + "vitess.io/vitess/go/vt/orchestrator/attributes" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/inst" + "vitess.io/vitess/go/vt/orchestrator/kv" + ometrics "vitess.io/vitess/go/vt/orchestrator/metrics" + "vitess.io/vitess/go/vt/orchestrator/os" + "vitess.io/vitess/go/vt/orchestrator/process" + orcraft "vitess.io/vitess/go/vt/orchestrator/raft" + "vitess.io/vitess/go/vt/orchestrator/util" +) + +var countPendingRecoveries int64 + +type RecoveryType string + +const ( + MasterRecovery RecoveryType = "MasterRecovery" + CoMasterRecovery = "CoMasterRecovery" + IntermediateMasterRecovery = "IntermediateMasterRecovery" +) + +type RecoveryAcknowledgement struct { + CreatedAt time.Time + Owner string + Comment string + + Key inst.InstanceKey + ClusterName string + Id int64 + UID string + AllRecoveries bool +} + +func NewRecoveryAcknowledgement(owner string, comment string) *RecoveryAcknowledgement { + return &RecoveryAcknowledgement{ + CreatedAt: time.Now(), + Owner: owner, + Comment: comment, + } +} + +func NewInternalAcknowledgement() *RecoveryAcknowledgement { + return &RecoveryAcknowledgement{ + CreatedAt: time.Now(), + Owner: "orchestrator", + Comment: "internal", + } +} + +// BlockedTopologyRecovery represents an entry in the blocked_topology_recovery table +type BlockedTopologyRecovery struct { + FailedInstanceKey inst.InstanceKey + ClusterName string + Analysis inst.AnalysisCode + LastBlockedTimestamp string + BlockingRecoveryId int64 +} + +// TopologyRecovery represents an entry in the topology_recovery table +type TopologyRecovery struct { + inst.PostponedFunctionsContainer + + Id int64 + UID string + AnalysisEntry inst.ReplicationAnalysis + SuccessorKey *inst.InstanceKey + SuccessorAlias string + IsActive bool + IsSuccessful bool + LostReplicas inst.InstanceKeyMap + ParticipatingInstanceKeys inst.InstanceKeyMap + AllErrors []string + RecoveryStartTimestamp string + RecoveryEndTimestamp string + ProcessingNodeHostname string + ProcessingNodeToken string + Acknowledged bool + AcknowledgedAt string + AcknowledgedBy string + AcknowledgedComment string + LastDetectionId int64 + RelatedRecoveryId int64 + Type RecoveryType + RecoveryType MasterRecoveryType +} + +func NewTopologyRecovery(replicationAnalysis inst.ReplicationAnalysis) *TopologyRecovery { + topologyRecovery := &TopologyRecovery{} + topologyRecovery.UID = util.PrettyUniqueToken() + topologyRecovery.AnalysisEntry = replicationAnalysis + topologyRecovery.SuccessorKey = nil + topologyRecovery.LostReplicas = *inst.NewInstanceKeyMap() + topologyRecovery.ParticipatingInstanceKeys = *inst.NewInstanceKeyMap() + topologyRecovery.AllErrors = []string{} + topologyRecovery.RecoveryType = NotMasterRecovery + return topologyRecovery +} + +func (this *TopologyRecovery) AddError(err error) error { + if err != nil { + this.AllErrors = append(this.AllErrors, err.Error()) + } + return err +} + +func (this *TopologyRecovery) AddErrors(errs []error) { + for _, err := range errs { + this.AddError(err) + } +} + +type TopologyRecoveryStep struct { + Id int64 + RecoveryUID string + AuditAt string + Message string +} + +func NewTopologyRecoveryStep(uid string, message string) *TopologyRecoveryStep { + return &TopologyRecoveryStep{ + RecoveryUID: uid, + Message: 
message, + } +} + +type MasterRecoveryType string + +const ( + NotMasterRecovery MasterRecoveryType = "NotMasterRecovery" + MasterRecoveryGTID = "MasterRecoveryGTID" + MasterRecoveryPseudoGTID = "MasterRecoveryPseudoGTID" + MasterRecoveryBinlogServer = "MasterRecoveryBinlogServer" +) + +var emergencyReadTopologyInstanceMap *cache.Cache +var emergencyRestartReplicaTopologyInstanceMap *cache.Cache +var emergencyOperationGracefulPeriodMap *cache.Cache + +// InstancesByCountReplicas sorts instances by number of replicas, descending +type InstancesByCountReplicas [](*inst.Instance) + +func (this InstancesByCountReplicas) Len() int { return len(this) } +func (this InstancesByCountReplicas) Swap(i, j int) { this[i], this[j] = this[j], this[i] } +func (this InstancesByCountReplicas) Less(i, j int) bool { + if len(this[i].Replicas) == len(this[j].Replicas) { + // Secondary sorting: prefer more advanced replicas + return !this[i].ExecBinlogCoordinates.SmallerThan(&this[j].ExecBinlogCoordinates) + } + return len(this[i].Replicas) < len(this[j].Replicas) +} + +var recoverDeadMasterCounter = metrics.NewCounter() +var recoverDeadMasterSuccessCounter = metrics.NewCounter() +var recoverDeadMasterFailureCounter = metrics.NewCounter() +var recoverDeadIntermediateMasterCounter = metrics.NewCounter() +var recoverDeadIntermediateMasterSuccessCounter = metrics.NewCounter() +var recoverDeadIntermediateMasterFailureCounter = metrics.NewCounter() +var recoverDeadCoMasterCounter = metrics.NewCounter() +var recoverDeadCoMasterSuccessCounter = metrics.NewCounter() +var recoverDeadCoMasterFailureCounter = metrics.NewCounter() +var countPendingRecoveriesGauge = metrics.NewGauge() + +func init() { + metrics.Register("recover.dead_master.start", recoverDeadMasterCounter) + metrics.Register("recover.dead_master.success", recoverDeadMasterSuccessCounter) + metrics.Register("recover.dead_master.fail", recoverDeadMasterFailureCounter) + metrics.Register("recover.dead_intermediate_master.start", recoverDeadIntermediateMasterCounter) + metrics.Register("recover.dead_intermediate_master.success", recoverDeadIntermediateMasterSuccessCounter) + metrics.Register("recover.dead_intermediate_master.fail", recoverDeadIntermediateMasterFailureCounter) + metrics.Register("recover.dead_co_master.start", recoverDeadCoMasterCounter) + metrics.Register("recover.dead_co_master.success", recoverDeadCoMasterSuccessCounter) + metrics.Register("recover.dead_co_master.fail", recoverDeadCoMasterFailureCounter) + metrics.Register("recover.pending", countPendingRecoveriesGauge) + + go initializeTopologyRecoveryPostConfiguration() + + ometrics.OnMetricsTick(func() { + countPendingRecoveriesGauge.Update(getCountPendingRecoveries()) + }) +} + +func getCountPendingRecoveries() int64 { + return atomic.LoadInt64(&countPendingRecoveries) +} + +func initializeTopologyRecoveryPostConfiguration() { + config.WaitForConfigurationToBeLoaded() + + emergencyReadTopologyInstanceMap = cache.New(time.Second, time.Millisecond*250) + emergencyRestartReplicaTopologyInstanceMap = cache.New(time.Second*30, time.Second) + emergencyOperationGracefulPeriodMap = cache.New(time.Second*5, time.Millisecond*500) +} + +// AuditTopologyRecovery audits a single step in a topology recovery process.
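+// Each audited message is logged and also persisted as a recovery step -- through a
+// "write-recovery-step" raft command when raft is enabled, or directly otherwise -- so the
+// recovery's audit trail can be inspected later.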
+func AuditTopologyRecovery(topologyRecovery *TopologyRecovery, message string) error { + log.Infof("topology_recovery: %s", message) + if topologyRecovery == nil { + return nil + } + + recoveryStep := NewTopologyRecoveryStep(topologyRecovery.UID, message) + if orcraft.IsRaftEnabled() { + _, err := orcraft.PublishCommand("write-recovery-step", recoveryStep) + return err + } else { + return writeTopologyRecoveryStep(recoveryStep) + } +} + +func resolveRecovery(topologyRecovery *TopologyRecovery, successorInstance *inst.Instance) error { + if successorInstance != nil { + topologyRecovery.SuccessorKey = &successorInstance.Key + topologyRecovery.SuccessorAlias = successorInstance.InstanceAlias + topologyRecovery.IsSuccessful = true + } + if orcraft.IsRaftEnabled() { + _, err := orcraft.PublishCommand("resolve-recovery", topologyRecovery) + return err + } else { + return writeResolveRecovery(topologyRecovery) + } +} + +// prepareCommand replaces agreed-upon placeholders with analysis data +func prepareCommand(command string, topologyRecovery *TopologyRecovery) (result string, async bool) { + analysisEntry := &topologyRecovery.AnalysisEntry + command = strings.TrimSpace(command) + if strings.HasSuffix(command, "&") { + command = strings.TrimRight(command, "&") + async = true + } + command = strings.Replace(command, "{failureType}", string(analysisEntry.Analysis), -1) + command = strings.Replace(command, "{instanceType}", string(analysisEntry.GetAnalysisInstanceType()), -1) + command = strings.Replace(command, "{isMaster}", fmt.Sprintf("%t", analysisEntry.IsMaster), -1) + command = strings.Replace(command, "{isCoMaster}", fmt.Sprintf("%t", analysisEntry.IsCoMaster), -1) + command = strings.Replace(command, "{failureDescription}", analysisEntry.Description, -1) + command = strings.Replace(command, "{command}", analysisEntry.CommandHint, -1) + command = strings.Replace(command, "{failedHost}", analysisEntry.AnalyzedInstanceKey.Hostname, -1) + command = strings.Replace(command, "{failedPort}", fmt.Sprintf("%d", analysisEntry.AnalyzedInstanceKey.Port), -1) + command = strings.Replace(command, "{failureCluster}", analysisEntry.ClusterDetails.ClusterName, -1) + command = strings.Replace(command, "{failureClusterAlias}", analysisEntry.ClusterDetails.ClusterAlias, -1) + command = strings.Replace(command, "{failureClusterDomain}", analysisEntry.ClusterDetails.ClusterDomain, -1) + command = strings.Replace(command, "{countSlaves}", fmt.Sprintf("%d", analysisEntry.CountReplicas), -1) + command = strings.Replace(command, "{countReplicas}", fmt.Sprintf("%d", analysisEntry.CountReplicas), -1) + command = strings.Replace(command, "{isDowntimed}", fmt.Sprint(analysisEntry.IsDowntimed), -1) + command = strings.Replace(command, "{autoMasterRecovery}", fmt.Sprint(analysisEntry.ClusterDetails.HasAutomatedMasterRecovery), -1) + command = strings.Replace(command, "{autoIntermediateMasterRecovery}", fmt.Sprint(analysisEntry.ClusterDetails.HasAutomatedIntermediateMasterRecovery), -1) + command = strings.Replace(command, "{orchestratorHost}", process.ThisHostname, -1) + command = strings.Replace(command, "{recoveryUID}", topologyRecovery.UID, -1) + + command = strings.Replace(command, "{isSuccessful}", fmt.Sprint(topologyRecovery.SuccessorKey != nil), -1) + if topologyRecovery.SuccessorKey != nil { + command = strings.Replace(command, "{successorHost}", topologyRecovery.SuccessorKey.Hostname, -1) + command = strings.Replace(command, "{successorPort}", fmt.Sprintf("%d", topologyRecovery.SuccessorKey.Port), -1) + // As 
long as SucesssorKey != nil, we replace {successorAlias}. + // If SucessorAlias is "", it's fine. We'll replace {successorAlias} with "". + command = strings.Replace(command, "{successorAlias}", topologyRecovery.SuccessorAlias, -1) + } + + command = strings.Replace(command, "{lostSlaves}", topologyRecovery.LostReplicas.ToCommaDelimitedList(), -1) + command = strings.Replace(command, "{lostReplicas}", topologyRecovery.LostReplicas.ToCommaDelimitedList(), -1) + command = strings.Replace(command, "{countLostReplicas}", fmt.Sprintf("%d", len(topologyRecovery.LostReplicas)), -1) + command = strings.Replace(command, "{slaveHosts}", analysisEntry.Replicas.ToCommaDelimitedList(), -1) + command = strings.Replace(command, "{replicaHosts}", analysisEntry.Replicas.ToCommaDelimitedList(), -1) + + return command, async +} + +// applyEnvironmentVariables sets the relevant environment variables for a recovery +func applyEnvironmentVariables(topologyRecovery *TopologyRecovery) []string { + analysisEntry := &topologyRecovery.AnalysisEntry + env := goos.Environ() + env = append(env, fmt.Sprintf("ORC_FAILURE_TYPE=%s", string(analysisEntry.Analysis))) + env = append(env, fmt.Sprintf("ORC_INSTANCE_TYPE=%s", string(analysisEntry.GetAnalysisInstanceType()))) + env = append(env, fmt.Sprintf("ORC_IS_MASTER=%t", analysisEntry.IsMaster)) + env = append(env, fmt.Sprintf("ORC_IS_CO_MASTER=%t", analysisEntry.IsCoMaster)) + env = append(env, fmt.Sprintf("ORC_FAILURE_DESCRIPTION=%s", analysisEntry.Description)) + env = append(env, fmt.Sprintf("ORC_COMMAND=%s", analysisEntry.CommandHint)) + env = append(env, fmt.Sprintf("ORC_FAILED_HOST=%s", analysisEntry.AnalyzedInstanceKey.Hostname)) + env = append(env, fmt.Sprintf("ORC_FAILED_PORT=%d", analysisEntry.AnalyzedInstanceKey.Port)) + env = append(env, fmt.Sprintf("ORC_FAILURE_CLUSTER=%s", analysisEntry.ClusterDetails.ClusterName)) + env = append(env, fmt.Sprintf("ORC_FAILURE_CLUSTER_ALIAS=%s", analysisEntry.ClusterDetails.ClusterAlias)) + env = append(env, fmt.Sprintf("ORC_FAILURE_CLUSTER_DOMAIN=%s", analysisEntry.ClusterDetails.ClusterDomain)) + env = append(env, fmt.Sprintf("ORC_COUNT_REPLICAS=%d", analysisEntry.CountReplicas)) + env = append(env, fmt.Sprintf("ORC_IS_DOWNTIMED=%v", analysisEntry.IsDowntimed)) + env = append(env, fmt.Sprintf("ORC_AUTO_MASTER_RECOVERY=%v", analysisEntry.ClusterDetails.HasAutomatedMasterRecovery)) + env = append(env, fmt.Sprintf("ORC_AUTO_INTERMEDIATE_MASTER_RECOVERY=%v", analysisEntry.ClusterDetails.HasAutomatedIntermediateMasterRecovery)) + env = append(env, fmt.Sprintf("ORC_ORCHESTRATOR_HOST=%s", process.ThisHostname)) + env = append(env, fmt.Sprintf("ORC_IS_SUCCESSFUL=%v", (topologyRecovery.SuccessorKey != nil))) + env = append(env, fmt.Sprintf("ORC_LOST_REPLICAS=%s", topologyRecovery.LostReplicas.ToCommaDelimitedList())) + env = append(env, fmt.Sprintf("ORC_REPLICA_HOSTS=%s", analysisEntry.Replicas.ToCommaDelimitedList())) + env = append(env, fmt.Sprintf("ORC_RECOVERY_UID=%s", topologyRecovery.UID)) + + if topologyRecovery.SuccessorKey != nil { + env = append(env, fmt.Sprintf("ORC_SUCCESSOR_HOST=%s", topologyRecovery.SuccessorKey.Hostname)) + env = append(env, fmt.Sprintf("ORC_SUCCESSOR_PORT=%d", topologyRecovery.SuccessorKey.Port)) + // As long as SucesssorKey != nil, we replace {successorAlias}. + // If SucessorAlias is "", it's fine. We'll replace {successorAlias} with "". 
+ env = append(env, fmt.Sprintf("ORC_SUCCESSOR_ALIAS=%s", topologyRecovery.SuccessorAlias)) + } + + return env +} + +func executeProcess(command string, env []string, topologyRecovery *TopologyRecovery, fullDescription string) (err error) { + // Log the command to be run and record how long it takes as this may be useful + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Running %s: %s", fullDescription, command)) + start := time.Now() + var info string + if err = os.CommandRun(command, env); err == nil { + info = fmt.Sprintf("Completed %s in %v", fullDescription, time.Since(start)) + } else { + info = fmt.Sprintf("Execution of %s failed in %v with error: %v", fullDescription, time.Since(start), err) + log.Errorf(info) + } + AuditTopologyRecovery(topologyRecovery, info) + return err +} + +// executeProcesses executes a list of processes +func executeProcesses(processes []string, description string, topologyRecovery *TopologyRecovery, failOnError bool) (err error) { + if len(processes) == 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("No %s hooks to run", description)) + return nil + } + + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Running %d %s hooks", len(processes), description)) + for i, command := range processes { + command, async := prepareCommand(command, topologyRecovery) + env := applyEnvironmentVariables(topologyRecovery) + + fullDescription := fmt.Sprintf("%s hook %d of %d", description, i+1, len(processes)) + if async { + fullDescription = fmt.Sprintf("%s (async)", fullDescription) + } + if async { + // Ignore errors + go executeProcess(command, env, topologyRecovery, fullDescription) + } else { + if cmdErr := executeProcess(command, env, topologyRecovery, fullDescription); cmdErr != nil { + if failOnError { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Not running further %s hooks", description)) + return cmdErr + } + if err == nil { + // Keep first error encountered + err = cmdErr + } + } + } + } + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("done running %s hooks", description)) + return err +} + +func recoverDeadMasterInBinlogServerTopology(topologyRecovery *TopologyRecovery) (promotedReplica *inst.Instance, err error) { + failedMasterKey := &topologyRecovery.AnalysisEntry.AnalyzedInstanceKey + + var promotedBinlogServer *inst.Instance + + _, promotedBinlogServer, err = inst.RegroupReplicasBinlogServers(failedMasterKey, true) + if err != nil { + return nil, log.Errore(err) + } + promotedBinlogServer, err = inst.StopReplication(&promotedBinlogServer.Key) + if err != nil { + return promotedReplica, log.Errore(err) + } + // Find candidate replica + promotedReplica, err = inst.GetCandidateReplicaOfBinlogServerTopology(&promotedBinlogServer.Key) + if err != nil { + return promotedReplica, log.Errore(err) + } + // Align it with binlog server coordinates + promotedReplica, err = inst.StopReplication(&promotedReplica.Key) + if err != nil { + return promotedReplica, log.Errore(err) + } + promotedReplica, err = inst.StartReplicationUntilMasterCoordinates(&promotedReplica.Key, &promotedBinlogServer.ExecBinlogCoordinates) + if err != nil { + return promotedReplica, log.Errore(err) + } + promotedReplica, err = inst.StopReplication(&promotedReplica.Key) + if err != nil { + return promotedReplica, log.Errore(err) + } + // Detach, flush binary logs forward + promotedReplica, err = inst.ResetReplication(&promotedReplica.Key) + if err != nil { + return promotedReplica, log.Errore(err) + } + promotedReplica, err = 
inst.FlushBinaryLogsTo(&promotedReplica.Key, promotedBinlogServer.ExecBinlogCoordinates.LogFile) + if err != nil { + return promotedReplica, log.Errore(err) + } + promotedReplica, err = inst.FlushBinaryLogs(&promotedReplica.Key, 1) + if err != nil { + return promotedReplica, log.Errore(err) + } + promotedReplica, err = inst.PurgeBinaryLogsToLatest(&promotedReplica.Key, false) + if err != nil { + return promotedReplica, log.Errore(err) + } + // Reconnect binlog servers to promoted replica (now master): + promotedBinlogServer, err = inst.SkipToNextBinaryLog(&promotedBinlogServer.Key) + if err != nil { + return promotedReplica, log.Errore(err) + } + promotedBinlogServer, err = inst.Repoint(&promotedBinlogServer.Key, &promotedReplica.Key, inst.GTIDHintDeny) + if err != nil { + return nil, log.Errore(err) + } + + func() { + // Move binlog server replicas up to replicate from master. + // This can only be done once a BLS has skipped to the next binlog + // We postpone this operation. The master is already promoted and we're happy. + binlogServerReplicas, err := inst.ReadBinlogServerReplicaInstances(&promotedBinlogServer.Key) + if err != nil { + return + } + maxBinlogServersToPromote := 3 + for i, binlogServerReplica := range binlogServerReplicas { + binlogServerReplica := binlogServerReplica + if i >= maxBinlogServersToPromote { + return + } + postponedFunction := func() error { + binlogServerReplica, err := inst.StopReplication(&binlogServerReplica.Key) + if err != nil { + return err + } + // Make sure the BLS has the "next binlog" -- the one the master flushed & purged to. Otherwise the BLS + // will request a binlog the master does not have + if binlogServerReplica.ExecBinlogCoordinates.SmallerThan(&promotedBinlogServer.ExecBinlogCoordinates) { + binlogServerReplica, err = inst.StartReplicationUntilMasterCoordinates(&binlogServerReplica.Key, &promotedBinlogServer.ExecBinlogCoordinates) + if err != nil { + return err + } + } + _, err = inst.Repoint(&binlogServerReplica.Key, &promotedReplica.Key, inst.GTIDHintDeny) + return err + } + topologyRecovery.AddPostponedFunction(postponedFunction, fmt.Sprintf("recoverDeadMasterInBinlogServerTopology, moving binlog server %+v", binlogServerReplica.Key)) + } + }() + + return promotedReplica, err +} + +func GetMasterRecoveryType(analysisEntry *inst.ReplicationAnalysis) (masterRecoveryType MasterRecoveryType) { + masterRecoveryType = MasterRecoveryPseudoGTID + if analysisEntry.OracleGTIDImmediateTopology || analysisEntry.MariaDBGTIDImmediateTopology { + masterRecoveryType = MasterRecoveryGTID + } else if analysisEntry.BinlogServerImmediateTopology { + masterRecoveryType = MasterRecoveryBinlogServer + } + return masterRecoveryType +} + +// recoverDeadMaster recovers a dead master, complete logic inside +func recoverDeadMaster(topologyRecovery *TopologyRecovery, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedReplica *inst.Instance, lostReplicas [](*inst.Instance), err error) { + topologyRecovery.Type = MasterRecovery + analysisEntry := &topologyRecovery.AnalysisEntry + failedInstanceKey := &analysisEntry.AnalyzedInstanceKey + var cannotReplicateReplicas [](*inst.Instance) + postponedAll := false + + inst.AuditOperation("recover-dead-master", failedInstanceKey, "problem found; will recover") + if !skipProcesses { + if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", topologyRecovery, true); err != nil { + return false, nil, lostReplicas, topologyRecovery.AddError(err) + } + } + + 
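+	// For illustration only (hypothetical configuration value): a failover hook such as
+	//   "PreFailoverProcesses": ["/path/to/notify-failover '{failureCluster}' '{failedHost}:{failedPort}' &"]
+	// is a plain shell command; prepareCommand() substitutes the {placeholder} tokens,
+	// applyEnvironmentVariables() exports the matching ORC_* variables, and the trailing "&"
+	// makes executeProcesses() run the hook asynchronously.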
AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: will recover %+v", *failedInstanceKey)) + + topologyRecovery.RecoveryType = GetMasterRecoveryType(analysisEntry) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: masterRecoveryType=%+v", topologyRecovery.RecoveryType)) + + promotedReplicaIsIdeal := func(promoted *inst.Instance, hasBestPromotionRule bool) bool { + if promoted == nil { + return false + } + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: promotedReplicaIsIdeal(%+v)", promoted.Key)) + if candidateInstanceKey != nil { //explicit request to promote a specific server + return promoted.Key.Equals(candidateInstanceKey) + } + if promoted.DataCenter == topologyRecovery.AnalysisEntry.AnalyzedInstanceDataCenter && + promoted.PhysicalEnvironment == topologyRecovery.AnalysisEntry.AnalyzedInstancePhysicalEnvironment { + if promoted.PromotionRule == inst.MustPromoteRule || promoted.PromotionRule == inst.PreferPromoteRule || + (hasBestPromotionRule && promoted.PromotionRule != inst.MustNotPromoteRule) { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: found %+v to be ideal candidate; will optimize recovery", promoted.Key)) + postponedAll = true + return true + } + } + return false + } + switch topologyRecovery.RecoveryType { + case MasterRecoveryGTID: + { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: regrouping replicas via GTID")) + lostReplicas, _, cannotReplicateReplicas, promotedReplica, err = inst.RegroupReplicasGTID(failedInstanceKey, true, nil, &topologyRecovery.PostponedFunctionsContainer, promotedReplicaIsIdeal) + } + case MasterRecoveryPseudoGTID: + { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: regrouping replicas via Pseudo-GTID")) + lostReplicas, _, _, cannotReplicateReplicas, promotedReplica, err = inst.RegroupReplicasPseudoGTIDIncludingSubReplicasOfBinlogServers(failedInstanceKey, true, nil, &topologyRecovery.PostponedFunctionsContainer, promotedReplicaIsIdeal) + } + case MasterRecoveryBinlogServer: + { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: recovering via binlog servers")) + promotedReplica, err = recoverDeadMasterInBinlogServerTopology(topologyRecovery) + } + } + topologyRecovery.AddError(err) + lostReplicas = append(lostReplicas, cannotReplicateReplicas...) 
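+	// Whatever could not be regrouped under the promoted replica (including replicas that
+	// cannot replicate from it) is considered lost: these are audited below, downtimed, and,
+	// when DetachLostReplicasAfterMasterFailover is set, detached from their master host.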
+ for _, replica := range lostReplicas { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: - lost replica: %+v", replica.Key)) + } + + if promotedReplica != nil && len(lostReplicas) > 0 && config.Config.DetachLostReplicasAfterMasterFailover { + postponedFunction := func() error { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: lost %+v replicas during recovery process; detaching them", len(lostReplicas))) + for _, replica := range lostReplicas { + replica := replica + inst.DetachReplicaMasterHost(&replica.Key) + } + return nil + } + topologyRecovery.AddPostponedFunction(postponedFunction, fmt.Sprintf("RecoverDeadMaster, detach %+v lost replicas", len(lostReplicas))) + } + + func() error { + inst.BeginDowntime(inst.NewDowntime(failedInstanceKey, inst.GetMaintenanceOwner(), inst.DowntimeLostInRecoveryMessage, time.Duration(config.LostInRecoveryDowntimeSeconds)*time.Second)) + acknowledgeInstanceFailureDetection(&analysisEntry.AnalyzedInstanceKey) + for _, replica := range lostReplicas { + replica := replica + inst.BeginDowntime(inst.NewDowntime(&replica.Key, inst.GetMaintenanceOwner(), inst.DowntimeLostInRecoveryMessage, time.Duration(config.LostInRecoveryDowntimeSeconds)*time.Second)) + } + return nil + }() + + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: %d postponed functions", topologyRecovery.PostponedFunctionsContainer.Len())) + + if promotedReplica != nil && !postponedAll { + promotedReplica, err = replacePromotedReplicaWithCandidate(topologyRecovery, &analysisEntry.AnalyzedInstanceKey, promotedReplica, candidateInstanceKey) + topologyRecovery.AddError(err) + } + + if promotedReplica == nil { + message := "Failure: no replica promoted." + AuditTopologyRecovery(topologyRecovery, message) + inst.AuditOperation("recover-dead-master", failedInstanceKey, message) + } else { + message := fmt.Sprintf("promoted replica: %+v", promotedReplica.Key) + AuditTopologyRecovery(topologyRecovery, message) + inst.AuditOperation("recover-dead-master", failedInstanceKey, message) + } + return true, promotedReplica, lostReplicas, err +} + +func MasterFailoverGeographicConstraintSatisfied(analysisEntry *inst.ReplicationAnalysis, suggestedInstance *inst.Instance) (satisfied bool, dissatisfiedReason string) { + if config.Config.PreventCrossDataCenterMasterFailover { + if suggestedInstance.DataCenter != analysisEntry.AnalyzedInstanceDataCenter { + return false, fmt.Sprintf("PreventCrossDataCenterMasterFailover: will not promote server in %s when failed server in %s", suggestedInstance.DataCenter, analysisEntry.AnalyzedInstanceDataCenter) + } + } + if config.Config.PreventCrossRegionMasterFailover { + if suggestedInstance.Region != analysisEntry.AnalyzedInstanceRegion { + return false, fmt.Sprintf("PreventCrossRegionMasterFailover: will not promote server in %s when failed server in %s", suggestedInstance.Region, analysisEntry.AnalyzedInstanceRegion) + } + } + return true, "" +} + +// SuggestReplacementForPromotedReplica returns a server to take over the already +// promoted replica, if such server is found and makes an improvement over the promoted replica. 
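+// The search below prefers, in order: the promoted replica itself when it is a registered
+// candidate in the failed master's DC and environment; a candidate in the failed master's
+// DC and environment; the promoted replica when it is any candidate satisfying the
+// geographic constraints; a candidate in the promoted replica's DC and environment; any
+// candidate satisfying the constraints; and, when constraints or a prefer_not promotion
+// rule force the search to continue, neutral-promotion-rule servers by the same criteria.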
+func SuggestReplacementForPromotedReplica(topologyRecovery *TopologyRecovery, deadInstanceKey *inst.InstanceKey, promotedReplica *inst.Instance, candidateInstanceKey *inst.InstanceKey) (replacement *inst.Instance, actionRequired bool, err error) { + candidateReplicas, _ := inst.ReadClusterCandidateInstances(promotedReplica.ClusterName) + candidateReplicas = inst.RemoveInstance(candidateReplicas, deadInstanceKey) + deadInstance, _, err := inst.ReadInstance(deadInstanceKey) + if err != nil { + deadInstance = nil + } + // So we've already promoted a replica. + // However, can we improve on our choice? Are there any replicas marked with "is_candidate"? + // Maybe we actually promoted such a replica. Does that mean we should keep it? + // Maybe we promoted a "neutral", and some "prefer" server is available. + // Maybe we promoted a "prefer_not" + // Maybe we promoted a server in a different DC than the master + // There's many options. We may wish to replace the server we promoted with a better one. + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("checking if should replace promoted replica with a better candidate")) + if candidateInstanceKey == nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("+ checking if promoted replica is the ideal candidate")) + if deadInstance != nil { + for _, candidateReplica := range candidateReplicas { + if promotedReplica.Key.Equals(&candidateReplica.Key) && + promotedReplica.DataCenter == deadInstance.DataCenter && + promotedReplica.PhysicalEnvironment == deadInstance.PhysicalEnvironment { + // Seems like we promoted a candidate in the same DC & ENV as dead IM! Ideal! We're happy! + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("promoted replica %+v is the ideal candidate", promotedReplica.Key)) + return promotedReplica, false, nil + } + } + } + } + // We didn't pick the ideal candidate; let's see if we can replace with a candidate from same DC and ENV + if candidateInstanceKey == nil { + // Try a candidate replica that is in same DC & env as the dead instance + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("+ searching for an ideal candidate")) + if deadInstance != nil { + for _, candidateReplica := range candidateReplicas { + if canTakeOverPromotedServerAsMaster(candidateReplica, promotedReplica) && + candidateReplica.DataCenter == deadInstance.DataCenter && + candidateReplica.PhysicalEnvironment == deadInstance.PhysicalEnvironment { + // This would make a great candidate + candidateInstanceKey = &candidateReplica.Key + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no candidate was offered for %+v but orchestrator picks %+v as candidate replacement, based on being in same DC & env as failed instance", *deadInstanceKey, candidateReplica.Key)) + } + } + } + } + if candidateInstanceKey == nil { + // We cannot find a candidate in same DC and ENV as dead master + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("+ checking if promoted replica is an OK candidate")) + for _, candidateReplica := range candidateReplicas { + if promotedReplica.Key.Equals(&candidateReplica.Key) { + // Seems like we promoted a candidate replica (though not in same DC and ENV as dead master) + if satisfied, reason := MasterFailoverGeographicConstraintSatisfied(&topologyRecovery.AnalysisEntry, candidateReplica); satisfied { + // Good enough. No further action required. 
+ AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("promoted replica %+v is a good candidate", promotedReplica.Key)) + return promotedReplica, false, nil + } else { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("skipping %+v; %s", candidateReplica.Key, reason)) + } + } + } + } + // Still nothing? + if candidateInstanceKey == nil { + // Try a candidate replica that is in same DC & env as the promoted replica (our promoted replica is not an "is_candidate") + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("+ searching for a candidate")) + for _, candidateReplica := range candidateReplicas { + if canTakeOverPromotedServerAsMaster(candidateReplica, promotedReplica) && + promotedReplica.DataCenter == candidateReplica.DataCenter && + promotedReplica.PhysicalEnvironment == candidateReplica.PhysicalEnvironment { + // OK, better than nothing + candidateInstanceKey = &candidateReplica.Key + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no candidate was offered for %+v but orchestrator picks %+v as candidate replacement, based on being in same DC & env as promoted instance", promotedReplica.Key, candidateReplica.Key)) + } + } + } + // Still nothing? + if candidateInstanceKey == nil { + // Try a candidate replica (our promoted replica is not an "is_candidate") + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("+ searching for a candidate")) + for _, candidateReplica := range candidateReplicas { + if canTakeOverPromotedServerAsMaster(candidateReplica, promotedReplica) { + if satisfied, reason := MasterFailoverGeographicConstraintSatisfied(&topologyRecovery.AnalysisEntry, candidateReplica); satisfied { + // OK, better than nothing + candidateInstanceKey = &candidateReplica.Key + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no candidate was offered for %+v but orchestrator picks %+v as candidate replacement", promotedReplica.Key, candidateReplica.Key)) + } else { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("skipping %+v; %s", candidateReplica.Key, reason)) + } + } + } + } + + keepSearchingHint := "" + if satisfied, reason := MasterFailoverGeographicConstraintSatisfied(&topologyRecovery.AnalysisEntry, promotedReplica); !satisfied { + keepSearchingHint = fmt.Sprintf("Will keep searching; %s", reason) + } else if promotedReplica.PromotionRule == inst.PreferNotPromoteRule { + keepSearchingHint = fmt.Sprintf("Will keep searching because we have promoted a server with prefer_not rule: %+v", promotedReplica.Key) + } + if keepSearchingHint != "" { + AuditTopologyRecovery(topologyRecovery, keepSearchingHint) + neutralReplicas, _ := inst.ReadClusterNeutralPromotionRuleInstances(promotedReplica.ClusterName) + + if candidateInstanceKey == nil { + // Still nothing? Then we didn't find a replica marked as "candidate". 
OK, further down the stream we have: + // find neutral instance in same dv&env as dead master + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("+ searching for a neutral server to replace promoted server, in same DC and env as dead master")) + for _, neutralReplica := range neutralReplicas { + if canTakeOverPromotedServerAsMaster(neutralReplica, promotedReplica) && + deadInstance.DataCenter == neutralReplica.DataCenter && + deadInstance.PhysicalEnvironment == neutralReplica.PhysicalEnvironment { + candidateInstanceKey = &neutralReplica.Key + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no candidate was offered for %+v but orchestrator picks %+v as candidate replacement, based on being in same DC & env as dead master", promotedReplica.Key, neutralReplica.Key)) + } + } + } + if candidateInstanceKey == nil { + // find neutral instance in same dv&env as promoted replica + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("+ searching for a neutral server to replace promoted server, in same DC and env as promoted replica")) + for _, neutralReplica := range neutralReplicas { + if canTakeOverPromotedServerAsMaster(neutralReplica, promotedReplica) && + promotedReplica.DataCenter == neutralReplica.DataCenter && + promotedReplica.PhysicalEnvironment == neutralReplica.PhysicalEnvironment { + candidateInstanceKey = &neutralReplica.Key + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no candidate was offered for %+v but orchestrator picks %+v as candidate replacement, based on being in same DC & env as promoted instance", promotedReplica.Key, neutralReplica.Key)) + } + } + } + if candidateInstanceKey == nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("+ searching for a neutral server to replace a prefer_not")) + for _, neutralReplica := range neutralReplicas { + if canTakeOverPromotedServerAsMaster(neutralReplica, promotedReplica) { + if satisfied, reason := MasterFailoverGeographicConstraintSatisfied(&topologyRecovery.AnalysisEntry, neutralReplica); satisfied { + // OK, better than nothing + candidateInstanceKey = &neutralReplica.Key + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("no candidate was offered for %+v but orchestrator picks %+v as candidate replacement, based on promoted instance having prefer_not promotion rule", promotedReplica.Key, neutralReplica.Key)) + } else { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("skipping %+v; %s", neutralReplica.Key, reason)) + } + } + } + } + } + + // So do we have a candidate? + if candidateInstanceKey == nil { + // Found nothing. Stick with promoted replica + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("+ found no server to promote on top promoted replica")) + return promotedReplica, false, nil + } + if promotedReplica.Key.Equals(candidateInstanceKey) { + // Sanity. It IS the candidate, nothing to promote... + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("+ sanity check: found our very own server to promote; doing nothing")) + return promotedReplica, false, nil + } + replacement, _, err = inst.ReadInstance(candidateInstanceKey) + return replacement, true, err +} + +// replacePromotedReplicaWithCandidate is called after a master (or co-master) +// died and was replaced by some promotedReplica. +// But, is there an even better replica to promote? +// if candidateInstanceKey is given, then it is forced to be promoted over the promotedReplica +// Otherwise, search for the best to promote! 
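+// The takeover is only attempted when the suggested candidate already replicates directly
+// from the promoted server; relocating the remaining replicas under the new master is then
+// registered as a postponed function so it does not delay resolving the failover.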
+func replacePromotedReplicaWithCandidate(topologyRecovery *TopologyRecovery, deadInstanceKey *inst.InstanceKey, promotedReplica *inst.Instance, candidateInstanceKey *inst.InstanceKey) (*inst.Instance, error) { + candidateInstance, actionRequired, err := SuggestReplacementForPromotedReplica(topologyRecovery, deadInstanceKey, promotedReplica, candidateInstanceKey) + if err != nil { + return promotedReplica, log.Errore(err) + } + if !actionRequired { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replace-promoted-replica-with-candidate: promoted instance %+v requires no further action", promotedReplica.Key)) + return promotedReplica, nil + } + + // Try and promote suggested candidate, if applicable and possible + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replace-promoted-replica-with-candidate: promoted instance %+v is not the suggested candidate %+v. Will see what can be done", promotedReplica.Key, candidateInstance.Key)) + + if candidateInstance.MasterKey.Equals(&promotedReplica.Key) { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("replace-promoted-replica-with-candidate: suggested candidate %+v is replica of promoted instance %+v. Will try and take its master", candidateInstance.Key, promotedReplica.Key)) + candidateInstance, err = inst.TakeMaster(&candidateInstance.Key, topologyRecovery.Type == CoMasterRecovery) + if err != nil { + return promotedReplica, log.Errore(err) + } + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("success promoting %+v over %+v", candidateInstance.Key, promotedReplica.Key)) + + // As a follow-up to taking over, let's relocate all the rest of the replicas under the candidate instance + relocateReplicasFunc := func() error { + log.Debugf("replace-promoted-replica-with-candidate: relocating replicas of %+v below %+v", promotedReplica.Key, candidateInstance.Key) + + relocatedReplicas, _, err, _ := inst.RelocateReplicas(&promotedReplica.Key, &candidateInstance.Key, "") + log.Debugf("replace-promoted-replica-with-candidate: + relocated %+v replicas of %+v below %+v", len(relocatedReplicas), promotedReplica.Key, candidateInstance.Key) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("relocated %+v replicas of %+v below %+v", len(relocatedReplicas), promotedReplica.Key, candidateInstance.Key)) + return log.Errore(err) + } + postponedFunctionsContainer := &topologyRecovery.PostponedFunctionsContainer + if postponedFunctionsContainer != nil { + postponedFunctionsContainer.AddPostponedFunction(relocateReplicasFunc, fmt.Sprintf("replace-promoted-replica-with-candidate: relocate replicas of %+v", promotedReplica.Key)) + } else { + _ = relocateReplicasFunc() + // We do not propagate the error. It is logged, but otherwise should not fail the entire failover operation + } + return candidateInstance, nil + } + + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("could not manage to promote suggested candidate %+v", candidateInstance.Key)) + return promotedReplica, nil +} + +// checkAndRecoverDeadMaster checks a given analysis, decides whether to take action, and possibly takes action +// Returns true when action was taken.
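+// Note that even after a successful regroup the promotion may still be rejected or delayed:
+// geographic constraints, FailMasterPromotionOnLagMinutes,
+// FailMasterPromotionIfSQLThreadNotUpToDate and DelayMasterPromotionIfSQLThreadNotUpToDate
+// are all evaluated (see overrideMasterPromotion) before the recovery is resolved.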
+func checkAndRecoverDeadMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedMasterRecovery) { + return false, nil, nil + } + topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery) + if topologyRecovery == nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverDeadMaster.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } + + // That's it! We must do recovery! + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("will handle DeadMaster event on %+v", analysisEntry.ClusterDetails.ClusterName)) + recoverDeadMasterCounter.Inc(1) + recoveryAttempted, promotedReplica, lostReplicas, err := recoverDeadMaster(topologyRecovery, candidateInstanceKey, skipProcesses) + if err != nil { + AuditTopologyRecovery(topologyRecovery, err.Error()) + } + topologyRecovery.LostReplicas.AddInstances(lostReplicas) + if !recoveryAttempted { + return false, topologyRecovery, err + } + + overrideMasterPromotion := func() (*inst.Instance, error) { + if promotedReplica == nil { + // No promotion; nothing to override. + return promotedReplica, err + } + // Scenarios where we might cancel the promotion. + if satisfied, reason := MasterFailoverGeographicConstraintSatisfied(&analysisEntry, promotedReplica); !satisfied { + return nil, fmt.Errorf("RecoverDeadMaster: failed %+v promotion; %s", promotedReplica.Key, reason) + } + if config.Config.FailMasterPromotionOnLagMinutes > 0 && + time.Duration(promotedReplica.ReplicationLagSeconds.Int64)*time.Second >= time.Duration(config.Config.FailMasterPromotionOnLagMinutes)*time.Minute { + // candidate replica lags too much + return nil, fmt.Errorf("RecoverDeadMaster: failed promotion. FailMasterPromotionOnLagMinutes is set to %d (minutes) and promoted replica %+v 's lag is %d (seconds)", config.Config.FailMasterPromotionOnLagMinutes, promotedReplica.Key, promotedReplica.ReplicationLagSeconds.Int64) + } + if config.Config.FailMasterPromotionIfSQLThreadNotUpToDate && !promotedReplica.SQLThreadUpToDate() { + return nil, fmt.Errorf("RecoverDeadMaster: failed promotion. FailMasterPromotionIfSQLThreadNotUpToDate is set and promoted replica %+v 's sql thread is not up to date (relay logs still unapplied). Aborting promotion", promotedReplica.Key) + } + if config.Config.DelayMasterPromotionIfSQLThreadNotUpToDate && !promotedReplica.SQLThreadUpToDate() { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("DelayMasterPromotionIfSQLThreadNotUpToDate: waiting for SQL thread on %+v", promotedReplica.Key)) + if _, err := inst.WaitForSQLThreadUpToDate(&promotedReplica.Key, 0, 0); err != nil { + return nil, fmt.Errorf("DelayMasterPromotionIfSQLThreadNotUpToDate error: %+v", err) + } + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("DelayMasterPromotionIfSQLThreadNotUpToDate: SQL thread caught up on %+v", promotedReplica.Key)) + } + // All seems well. No override done. + return promotedReplica, err + } + if promotedReplica, err = overrideMasterPromotion(); err != nil { + AuditTopologyRecovery(topologyRecovery, err.Error()) + } + // And this is the end; whether successful or not, we're done. + resolveRecovery(topologyRecovery, promotedReplica) + // Now, see whether we are successful or not. 
From this point there's no going back. + if promotedReplica != nil { + // Success! + recoverDeadMasterSuccessCounter.Inc(1) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadMaster: successfully promoted %+v", promotedReplica.Key)) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadMaster: promoted server coordinates: %+v", promotedReplica.SelfBinlogCoordinates)) + + if config.Config.ApplyMySQLPromotionAfterMasterFailover || analysisEntry.CommandHint == inst.GracefulMasterTakeoverCommandHint { + // on GracefulMasterTakeoverCommandHint it makes utter sense to RESET SLAVE ALL and read_only=0, and there is no sense in not doing so. + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadMaster: will apply MySQL changes to promoted master")) + { + _, err := inst.ResetReplicationOperation(&promotedReplica.Key) + if err != nil { + // Ugly, but this is important. Let's give it another try + _, err = inst.ResetReplicationOperation(&promotedReplica.Key) + } + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadMaster: applying RESET SLAVE ALL on promoted master: success=%t", (err == nil))) + if err != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadMaster: NOTE that %+v is promoted even though SHOW SLAVE STATUS may still show it has a master", promotedReplica.Key)) + } + } + { + _, err := inst.SetReadOnly(&promotedReplica.Key, false) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadMaster: applying read-only=0 on promoted master: success=%t", (err == nil))) + } + // Let's attempt, though we won't necessarily succeed, to set old master as read-only + go func() { + _, err := inst.SetReadOnly(&analysisEntry.AnalyzedInstanceKey, true) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadMaster: applying read-only=1 on demoted master: success=%t", (err == nil))) + }() + } + + kvPairs := inst.GetClusterMasterKVPairs(analysisEntry.ClusterDetails.ClusterAlias, &promotedReplica.Key) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Writing KV %+v", kvPairs)) + if orcraft.IsRaftEnabled() { + for _, kvPair := range kvPairs { + _, err := orcraft.PublishCommand("put-key-value", kvPair) + log.Errore(err) + } + // since we'll be affecting 3rd party tools here, we _prefer_ to mitigate re-applying + // of the put-key-value event upon startup. We _recommend_ a snapshot in the near future. 
+ go orcraft.PublishCommand("async-snapshot", "") + } else { + for _, kvPair := range kvPairs { + err := kv.PutKVPair(kvPair) + log.Errore(err) + } + } + { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Distributing KV %+v", kvPairs)) + err := kv.DistributePairs(kvPairs) + log.Errore(err) + } + if config.Config.MasterFailoverDetachReplicaMasterHost { + postponedFunction := func() error { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadMaster: detaching master host on promoted master")) + inst.DetachReplicaMasterHost(&promotedReplica.Key) + return nil + } + topologyRecovery.AddPostponedFunction(postponedFunction, fmt.Sprintf("RecoverDeadMaster, detaching promoted master host %+v", promotedReplica.Key)) + } + func() error { + before := analysisEntry.AnalyzedInstanceKey.StringCode() + after := promotedReplica.Key.StringCode() + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadMaster: updating cluster_alias: %v -> %v", before, after)) + //~~~inst.ReplaceClusterName(before, after) + if alias := analysisEntry.ClusterDetails.ClusterAlias; alias != "" { + inst.SetClusterAlias(promotedReplica.Key.StringCode(), alias) + } else { + inst.ReplaceAliasClusterName(before, after) + } + return nil + }() + + attributes.SetGeneralAttribute(analysisEntry.ClusterDetails.ClusterDomain, promotedReplica.Key.StringCode()) + + if !skipProcesses { + // Execute post master-failover processes + executeProcesses(config.Config.PostMasterFailoverProcesses, "PostMasterFailoverProcesses", topologyRecovery, false) + } + } else { + recoverDeadMasterFailureCounter.Inc(1) + } + + return true, topologyRecovery, err +} + +// isGenerallyValidAsCandidateSiblingOfIntermediateMaster checks that basic server configuration and state are valid +func isGenerallyValidAsCandidateSiblingOfIntermediateMaster(sibling *inst.Instance) bool { + if !sibling.LogBinEnabled { + return false + } + if !sibling.LogReplicationUpdatesEnabled { + return false + } + if !sibling.ReplicaRunning() { + return false + } + if !sibling.IsLastCheckValid { + return false + } + return true +} + +// isValidAsCandidateSiblingOfIntermediateMaster checks to see that the given sibling is capable of taking over the instance's replicas +func isValidAsCandidateSiblingOfIntermediateMaster(intermediateMasterInstance *inst.Instance, sibling *inst.Instance) bool { + if sibling.Key.Equals(&intermediateMasterInstance.Key) { + // same instance + return false + } + if !isGenerallyValidAsCandidateSiblingOfIntermediateMaster(sibling) { + return false + } + if inst.IsBannedFromBeingCandidateReplica(sibling) { + return false + } + if sibling.HasReplicationFilters != intermediateMasterInstance.HasReplicationFilters { + return false + } + if sibling.IsBinlogServer() != intermediateMasterInstance.IsBinlogServer() { + // When both are binlog servers, failover is trivial. + // When failed IM is binlog server, its sibling is still valid, but we actually prefer to just repoint the replica up -- simplest! + return false + } + if sibling.ExecBinlogCoordinates.SmallerThan(&intermediateMasterInstance.ExecBinlogCoordinates) { + return false + } + return true +} + +func isGenerallyValidAsWouldBeMaster(replica *inst.Instance, requireLogReplicationUpdates bool) bool { + if !replica.IsLastCheckValid { + // something wrong with this replica right now.
We shouldn't hope to be able to promote it + return false + } + if !replica.LogBinEnabled { + return false + } + if requireLogReplicationUpdates && !replica.LogReplicationUpdatesEnabled { + return false + } + if replica.IsBinlogServer() { + return false + } + if inst.IsBannedFromBeingCandidateReplica(replica) { + return false + } + + return true +} + +func canTakeOverPromotedServerAsMaster(wantToTakeOver *inst.Instance, toBeTakenOver *inst.Instance) bool { + if !isGenerallyValidAsWouldBeMaster(wantToTakeOver, true) { + return false + } + if !wantToTakeOver.MasterKey.Equals(&toBeTakenOver.Key) { + return false + } + if canReplicate, _ := toBeTakenOver.CanReplicateFrom(wantToTakeOver); !canReplicate { + return false + } + return true +} + +// GetCandidateSiblingOfIntermediateMaster chooses the best sibling of a dead intermediate master +// to whom the IM's replicas can be moved. +func GetCandidateSiblingOfIntermediateMaster(topologyRecovery *TopologyRecovery, intermediateMasterInstance *inst.Instance) (*inst.Instance, error) { + + siblings, err := inst.ReadReplicaInstances(&intermediateMasterInstance.MasterKey) + if err != nil { + return nil, err + } + if len(siblings) <= 1 { + return nil, log.Errorf("topology_recovery: no siblings found for %+v", intermediateMasterInstance.Key) + } + + sort.Sort(sort.Reverse(InstancesByCountReplicas(siblings))) + + // In the next series of steps we attempt to return a good replacement. + // None of the below attempts is sure to pick a winning server. Perhaps the picked server is not up-to-date enough -- but + // this has small likelihood in the general case, and, well, it's an attempt. It's a Plan A, but we have Plan B & C if this fails. + + // At first, we try to return an "is_candidate" server in same dc & env + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("searching for the best candidate sibling of dead intermediate master %+v", intermediateMasterInstance.Key)) + for _, sibling := range siblings { + sibling := sibling + if isValidAsCandidateSiblingOfIntermediateMaster(intermediateMasterInstance, sibling) && + sibling.IsCandidate && + sibling.DataCenter == intermediateMasterInstance.DataCenter && + sibling.PhysicalEnvironment == intermediateMasterInstance.PhysicalEnvironment { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found %+v as the ideal candidate", sibling.Key)) + return sibling, nil + } + } + // No candidate in same DC & env, let's search for a candidate anywhere + for _, sibling := range siblings { + sibling := sibling + if isValidAsCandidateSiblingOfIntermediateMaster(intermediateMasterInstance, sibling) && sibling.IsCandidate { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found %+v as a replacement for %+v [candidate sibling]", sibling.Key, intermediateMasterInstance.Key)) + return sibling, nil + } + } + // Go for some valid sibling in the same DC & ENV + for _, sibling := range siblings { + sibling := sibling + if isValidAsCandidateSiblingOfIntermediateMaster(intermediateMasterInstance, sibling) && + sibling.DataCenter == intermediateMasterInstance.DataCenter && + sibling.PhysicalEnvironment == intermediateMasterInstance.PhysicalEnvironment { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found %+v as a replacement for %+v [same dc & environment]", sibling.Key, intermediateMasterInstance.Key)) + return sibling, nil + } + } + // Just whatever is valid.
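+	// At this point any sibling that passes isValidAsCandidateSiblingOfIntermediateMaster is
+	// acceptable, regardless of data center, environment or candidate flag.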
+ for _, sibling := range siblings { + sibling := sibling + if isValidAsCandidateSiblingOfIntermediateMaster(intermediateMasterInstance, sibling) { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found %+v as a replacement for %+v [any sibling]", sibling.Key, intermediateMasterInstance.Key)) + return sibling, nil + } + } + return nil, log.Errorf("topology_recovery: cannot find candidate sibling of %+v", intermediateMasterInstance.Key) +} + +// RecoverDeadIntermediateMaster performs intermediate master recovery; complete logic inside +func RecoverDeadIntermediateMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) (successorInstance *inst.Instance, err error) { + topologyRecovery.Type = IntermediateMasterRecovery + analysisEntry := &topologyRecovery.AnalysisEntry + failedInstanceKey := &analysisEntry.AnalyzedInstanceKey + recoveryResolved := false + + inst.AuditOperation("recover-dead-intermediate-master", failedInstanceKey, "problem found; will recover") + if !skipProcesses { + if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", topologyRecovery, true); err != nil { + return nil, topologyRecovery.AddError(err) + } + } + + intermediateMasterInstance, _, err := inst.ReadInstance(failedInstanceKey) + if err != nil { + return nil, topologyRecovery.AddError(err) + } + // Find possible candidate + candidateSiblingOfIntermediateMaster, _ := GetCandidateSiblingOfIntermediateMaster(topologyRecovery, intermediateMasterInstance) + relocateReplicasToCandidateSibling := func() { + if candidateSiblingOfIntermediateMaster == nil { + return + } + // We have a candidate + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: will attempt a candidate intermediate master: %+v", candidateSiblingOfIntermediateMaster.Key)) + relocatedReplicas, candidateSibling, err, errs := inst.RelocateReplicas(failedInstanceKey, &candidateSiblingOfIntermediateMaster.Key, "") + topologyRecovery.AddErrors(errs) + topologyRecovery.ParticipatingInstanceKeys.AddKey(candidateSiblingOfIntermediateMaster.Key) + + if len(relocatedReplicas) == 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: failed to move any replica to candidate intermediate master (%+v)", candidateSibling.Key)) + return + } + if err != nil || len(errs) > 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: move to candidate intermediate master (%+v) did not complete: err: %+v, errs: %+v", candidateSibling.Key, err, errs)) + return + } + if err == nil { + recoveryResolved = true + successorInstance = candidateSibling + + inst.AuditOperation("recover-dead-intermediate-master", failedInstanceKey, fmt.Sprintf("Relocated %d replicas under candidate sibling: %+v; %d errors: %+v", len(relocatedReplicas), candidateSibling.Key, len(errs), errs)) + } + } + // Plan A: find a replacement intermediate master in same Data Center + if candidateSiblingOfIntermediateMaster != nil && candidateSiblingOfIntermediateMaster.DataCenter == intermediateMasterInstance.DataCenter { + relocateReplicasToCandidateSibling() + } + if !recoveryResolved { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: will next attempt regrouping of replicas")) + // Plan B: regroup (we wish to reduce cross-DC replication streams) + lostReplicas, _, _, _, regroupPromotedReplica, regroupError := inst.RegroupReplicas(failedInstanceKey, true, nil, nil) + if regroupError != nil { + 
topologyRecovery.AddError(regroupError) + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: regroup failed on: %+v", regroupError)) + } + if regroupPromotedReplica != nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: regrouped under %+v, with %d lost replicas", regroupPromotedReplica.Key, len(lostReplicas))) + topologyRecovery.ParticipatingInstanceKeys.AddKey(regroupPromotedReplica.Key) + if len(lostReplicas) == 0 && regroupError == nil { + // Seems like the regroup worked flawlessly. The local replica took over all of its siblings. + // We can consider this host to be the successor. + successorInstance = regroupPromotedReplica + } + } + // Plan C: try replacement intermediate master in other DC... + if candidateSiblingOfIntermediateMaster != nil && candidateSiblingOfIntermediateMaster.DataCenter != intermediateMasterInstance.DataCenter { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: will next attempt relocating to another DC server")) + relocateReplicasToCandidateSibling() + } + } + if !recoveryResolved { + // Do we still have leftovers? some replicas couldn't move? Couldn't regroup? Only left with regroup's resulting leader? + // nothing moved? + // We don't care much if regroup made it or not. We prefer that it made it, in which case we only need to relocate up + // one replica, but the operation is still valid if regroup partially/completely failed. We just promote anything + // not regrouped. + // So, match up all that's left, plan D + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: will next attempt to relocate up from %+v", *failedInstanceKey)) + + relocatedReplicas, masterInstance, err, errs := inst.RelocateReplicas(failedInstanceKey, &analysisEntry.AnalyzedInstanceMasterKey, "") + topologyRecovery.AddErrors(errs) + topologyRecovery.ParticipatingInstanceKeys.AddKey(analysisEntry.AnalyzedInstanceMasterKey) + + if len(relocatedReplicas) > 0 { + recoveryResolved = true + if successorInstance == nil { + // There could have been a local replica taking over its siblings. We'd like to consider that one as successor. + successorInstance = masterInstance + } + inst.AuditOperation("recover-dead-intermediate-master", failedInstanceKey, fmt.Sprintf("Relocated replicas under: %+v %d errors: %+v", successorInstance.Key, len(errs), errs)) + } else { + err = log.Errorf("topology_recovery: RecoverDeadIntermediateMaster failed to match up any replica from %+v", *failedInstanceKey) + topologyRecovery.AddError(err) + } + } + if !recoveryResolved { + successorInstance = nil + } + resolveRecovery(topologyRecovery, successorInstance) + return successorInstance, err +} + +// checkAndRecoverDeadIntermediateMaster checks a given analysis, decides whether to take action, and possibly takes action +// Returns true when action was taken. +func checkAndRecoverDeadIntermediateMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { + if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedIntermediateMasterRecovery) { + return false, nil, nil + } + topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery) + if topologyRecovery == nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadIntermediateMaster: found an active or recent recovery on %+v. 
Will not issue another RecoverDeadIntermediateMaster.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } + + // That's it! We must do recovery! + recoverDeadIntermediateMasterCounter.Inc(1) + promotedReplica, err := RecoverDeadIntermediateMaster(topologyRecovery, skipProcesses) + if promotedReplica != nil { + // success + recoverDeadIntermediateMasterSuccessCounter.Inc(1) + + if !skipProcesses { + // Execute post intermediate-master-failover processes + topologyRecovery.SuccessorKey = &promotedReplica.Key + topologyRecovery.SuccessorAlias = promotedReplica.InstanceAlias + executeProcesses(config.Config.PostIntermediateMasterFailoverProcesses, "PostIntermediateMasterFailoverProcesses", topologyRecovery, false) + } + } else { + recoverDeadIntermediateMasterFailureCounter.Inc(1) + } + return true, topologyRecovery, err +} + +// RecoverDeadCoMaster recovers a dead co-master, complete logic inside +func RecoverDeadCoMaster(topologyRecovery *TopologyRecovery, skipProcesses bool) (promotedReplica *inst.Instance, lostReplicas [](*inst.Instance), err error) { + topologyRecovery.Type = CoMasterRecovery + analysisEntry := &topologyRecovery.AnalysisEntry + failedInstanceKey := &analysisEntry.AnalyzedInstanceKey + otherCoMasterKey := &analysisEntry.AnalyzedInstanceMasterKey + otherCoMaster, found, _ := inst.ReadInstance(otherCoMasterKey) + if otherCoMaster == nil || !found { + return nil, lostReplicas, topologyRecovery.AddError(log.Errorf("RecoverDeadCoMaster: could not read info for co-master %+v of %+v", *otherCoMasterKey, *failedInstanceKey)) + } + inst.AuditOperation("recover-dead-co-master", failedInstanceKey, "problem found; will recover") + if !skipProcesses { + if err := executeProcesses(config.Config.PreFailoverProcesses, "PreFailoverProcesses", topologyRecovery, true); err != nil { + return nil, lostReplicas, topologyRecovery.AddError(err) + } + } + + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadCoMaster: will recover %+v", *failedInstanceKey)) + + var coMasterRecoveryType MasterRecoveryType = MasterRecoveryPseudoGTID + if analysisEntry.OracleGTIDImmediateTopology || analysisEntry.MariaDBGTIDImmediateTopology { + coMasterRecoveryType = MasterRecoveryGTID + } + + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadCoMaster: coMasterRecoveryType=%+v", coMasterRecoveryType)) + + var cannotReplicateReplicas [](*inst.Instance) + switch coMasterRecoveryType { + case MasterRecoveryGTID: + { + lostReplicas, _, cannotReplicateReplicas, promotedReplica, err = inst.RegroupReplicasGTID(failedInstanceKey, true, nil, &topologyRecovery.PostponedFunctionsContainer, nil) + } + case MasterRecoveryPseudoGTID: + { + lostReplicas, _, _, cannotReplicateReplicas, promotedReplica, err = inst.RegroupReplicasPseudoGTIDIncludingSubReplicasOfBinlogServers(failedInstanceKey, true, nil, &topologyRecovery.PostponedFunctionsContainer, nil) + } + } + topologyRecovery.AddError(err) + lostReplicas = append(lostReplicas, cannotReplicateReplicas...) + + mustPromoteOtherCoMaster := config.Config.CoMasterRecoveryMustPromoteOtherCoMaster + if !otherCoMaster.ReadOnly { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadCoMaster: other co-master %+v is writeable hence has to be promoted", otherCoMaster.Key)) + mustPromoteOtherCoMaster = true + } + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadCoMaster: mustPromoteOtherCoMaster? 
%+v", mustPromoteOtherCoMaster)) + + if promotedReplica != nil { + topologyRecovery.ParticipatingInstanceKeys.AddKey(promotedReplica.Key) + if mustPromoteOtherCoMaster { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("RecoverDeadCoMaster: mustPromoteOtherCoMaster. Verifying that %+v is/can be promoted", *otherCoMasterKey)) + promotedReplica, err = replacePromotedReplicaWithCandidate(topologyRecovery, failedInstanceKey, promotedReplica, otherCoMasterKey) + } else { + // We are allowed to promote any server + promotedReplica, err = replacePromotedReplicaWithCandidate(topologyRecovery, failedInstanceKey, promotedReplica, nil) + } + topologyRecovery.AddError(err) + } + if promotedReplica != nil { + if mustPromoteOtherCoMaster && !promotedReplica.Key.Equals(otherCoMasterKey) { + topologyRecovery.AddError(log.Errorf("RecoverDeadCoMaster: could not manage to promote other-co-master %+v; was only able to promote %+v; mustPromoteOtherCoMaster is true (either CoMasterRecoveryMustPromoteOtherCoMaster is true, or co-master is writeable), therefore failing", *otherCoMasterKey, promotedReplica.Key)) + promotedReplica = nil + } + } + if promotedReplica != nil { + if config.Config.DelayMasterPromotionIfSQLThreadNotUpToDate { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Waiting to ensure the SQL thread catches up on %+v", promotedReplica.Key)) + if _, err := inst.WaitForSQLThreadUpToDate(&promotedReplica.Key, 0, 0); err != nil { + return promotedReplica, lostReplicas, err + } + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("SQL thread caught up on %+v", promotedReplica.Key)) + } + topologyRecovery.ParticipatingInstanceKeys.AddKey(promotedReplica.Key) + } + + // OK, we may have someone promoted. Either this was the other co-master or another replica. + // Noting down that we DO NOT attempt to set a new co-master topology. We are good with remaining with a single master. + // I tried solving the "let's promote a replica and create a new co-master setup" but this turns so complex due to various factors. + // I see this as risky and not worth the questionable benefit. + // Maybe future me is a smarter person and finds a simple solution. Unlikely. I'm getting dumber. + // + // ... + // Now that we're convinved, take a look at what we can be left with: + // Say we started with M1<->M2<-S1, with M2 failing, and we promoted S1. + // We now have M1->S1 (because S1 is promoted), S1->M2 (because that's what it remembers), M2->M1 (because that's what it remembers) + // !! This is an evil 3-node circle that must be broken. + // config.Config.ApplyMySQLPromotionAfterMasterFailover, if true, will cause it to break, because we would RESET SLAVE on S1 + // but we want to make sure the circle is broken no matter what. 
+ // So in the case we promoted not-the-other-co-master, we issue a detach-replica-master-host, which is a reversible operation + if promotedReplica != nil && !promotedReplica.Key.Equals(otherCoMasterKey) { + _, err = inst.DetachReplicaMasterHost(&promotedReplica.Key) + topologyRecovery.AddError(log.Errore(err)) + } + + if promotedReplica != nil && len(lostReplicas) > 0 && config.Config.DetachLostReplicasAfterMasterFailover { + postponedFunction := func() error { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadCoMaster: lost %+v replicas during recovery process; detaching them", len(lostReplicas))) + for _, replica := range lostReplicas { + replica := replica + inst.DetachReplicaMasterHost(&replica.Key) + } + return nil + } + topologyRecovery.AddPostponedFunction(postponedFunction, fmt.Sprintf("RecoverDeadCoMaster, detaching %+v replicas", len(lostReplicas))) + } + + func() error { + inst.BeginDowntime(inst.NewDowntime(failedInstanceKey, inst.GetMaintenanceOwner(), inst.DowntimeLostInRecoveryMessage, time.Duration(config.LostInRecoveryDowntimeSeconds)*time.Second)) + acknowledgeInstanceFailureDetection(&analysisEntry.AnalyzedInstanceKey) + for _, replica := range lostReplicas { + replica := replica + inst.BeginDowntime(inst.NewDowntime(&replica.Key, inst.GetMaintenanceOwner(), inst.DowntimeLostInRecoveryMessage, time.Duration(config.LostInRecoveryDowntimeSeconds)*time.Second)) + } + return nil + }() + + return promotedReplica, lostReplicas, err +} + +// checkAndRecoverDeadCoMaster checks a given analysis, decides whether to take action, and possibly takes action +// Returns true when action was taken. +func checkAndRecoverDeadCoMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { + failedInstanceKey := &analysisEntry.AnalyzedInstanceKey + if !(forceInstanceRecovery || analysisEntry.ClusterDetails.HasAutomatedMasterRecovery) { + return false, nil, nil + } + topologyRecovery, err := AttemptRecoveryRegistration(&analysisEntry, !forceInstanceRecovery, !forceInstanceRecovery) + if topologyRecovery == nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverDeadCoMaster.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } + + // That's it! We must do recovery! + recoverDeadCoMasterCounter.Inc(1) + promotedReplica, lostReplicas, err := RecoverDeadCoMaster(topologyRecovery, skipProcesses) + resolveRecovery(topologyRecovery, promotedReplica) + if promotedReplica == nil { + inst.AuditOperation("recover-dead-co-master", failedInstanceKey, "Failure: no replica promoted.") + } else { + inst.AuditOperation("recover-dead-co-master", failedInstanceKey, fmt.Sprintf("promoted: %+v", promotedReplica.Key)) + } + topologyRecovery.LostReplicas.AddInstances(lostReplicas) + if promotedReplica != nil { + if config.Config.FailMasterPromotionIfSQLThreadNotUpToDate && !promotedReplica.SQLThreadUpToDate() { + return false, nil, log.Errorf("Promoted replica %+v: sql thread is not up to date (relay logs still unapplied). 
Aborting promotion", promotedReplica.Key) + } + // success + recoverDeadCoMasterSuccessCounter.Inc(1) + + if config.Config.ApplyMySQLPromotionAfterMasterFailover { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("- RecoverDeadMaster: will apply MySQL changes to promoted master")) + inst.SetReadOnly(&promotedReplica.Key, false) + } + if !skipProcesses { + // Execute post intermediate-master-failover processes + topologyRecovery.SuccessorKey = &promotedReplica.Key + topologyRecovery.SuccessorAlias = promotedReplica.InstanceAlias + executeProcesses(config.Config.PostMasterFailoverProcesses, "PostMasterFailoverProcesses", topologyRecovery, false) + } + } else { + recoverDeadCoMasterFailureCounter.Inc(1) + } + return true, topologyRecovery, err +} + +// checkAndRecoverGenericProblem is a general-purpose recovery function +func checkAndRecoverLockedSemiSyncMaster(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + + topologyRecovery, err = AttemptRecoveryRegistration(&analysisEntry, true, true) + if topologyRecovery == nil { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("found an active or recent recovery on %+v. Will not issue another RecoverLockedSemiSyncMaster.", analysisEntry.AnalyzedInstanceKey)) + return false, nil, err + } + + return false, nil, nil +} + +// checkAndRecoverGenericProblem is a general-purpose recovery function +func checkAndRecoverGenericProblem(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (bool, *TopologyRecovery, error) { + return false, nil, nil +} + +// Force a re-read of a topology instance; this is done because we need to substantiate a suspicion +// that we may have a failover scenario. we want to speed up reading the complete picture. +func emergentlyReadTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) (instance *inst.Instance, err error) { + if existsInCacheError := emergencyReadTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil { + // Just recently attempted + return nil, nil + } + instance, err = inst.ReadTopologyInstance(instanceKey) + inst.AuditOperation("emergently-read-topology-instance", instanceKey, string(analysisCode)) + return instance, err +} + +// Force reading of replicas of given instance. This is because we suspect the instance is dead, and want to speed up +// detection of replication failure from its replicas. +func emergentlyReadTopologyInstanceReplicas(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) { + replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(instanceKey) + if err != nil { + return + } + for _, replica := range replicas { + go emergentlyReadTopologyInstance(&replica.Key, analysisCode) + } +} + +// emergentlyRestartReplicationOnTopologyInstance forces a RestartReplication on a given instance. 
+func emergentlyRestartReplicationOnTopologyInstance(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) { + if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil { + // Just recently attempted on this specific replica + return + } + go inst.ExecuteOnTopology(func() { + inst.RestartReplicationQuick(instanceKey) + inst.AuditOperation("emergently-restart-replication-topology-instance", instanceKey, string(analysisCode)) + }) +} + +func beginEmergencyOperationGracefulPeriod(instanceKey *inst.InstanceKey) { + emergencyOperationGracefulPeriodMap.Set(instanceKey.StringCode(), true, cache.DefaultExpiration) +} + +func isInEmergencyOperationGracefulPeriod(instanceKey *inst.InstanceKey) bool { + _, found := emergencyOperationGracefulPeriodMap.Get(instanceKey.StringCode()) + return found +} + +// emergentlyRestartReplicationOnTopologyInstanceReplicas forces a stop slave + start slave on +// replicas of a given instance, in an attempt to cause them to re-evaluate their replication state. +// This can be useful in scenarios where the master has Too Many Connections, but long-time connected +// replicas are not seeing this; when they stop+start replication, they need to re-authenticate and +// that's where we hope they realize the master is bad. +func emergentlyRestartReplicationOnTopologyInstanceReplicas(instanceKey *inst.InstanceKey, analysisCode inst.AnalysisCode) { + if existsInCacheError := emergencyRestartReplicaTopologyInstanceMap.Add(instanceKey.StringCode(), true, cache.DefaultExpiration); existsInCacheError != nil { + // While each replica's RestartReplication() is throttled on its own, it's also wasteful to + // iterate all replicas all the time. This is the reason why we do grand-throttle check. + return + } + beginEmergencyOperationGracefulPeriod(instanceKey) + + replicas, err := inst.ReadReplicaInstancesIncludingBinlogServerSubReplicas(instanceKey) + if err != nil { + return + } + for _, replica := range replicas { + replicaKey := &replica.Key + go emergentlyRestartReplicationOnTopologyInstance(replicaKey, analysisCode) + } +} + +func emergentlyRecordStaleBinlogCoordinates(instanceKey *inst.InstanceKey, binlogCoordinates *inst.BinlogCoordinates) { + err := inst.RecordStaleInstanceBinlogCoordinates(instanceKey, binlogCoordinates) + log.Errore(err) +} + +// checkAndExecuteFailureDetectionProcesses tries to register for failure detection and potentially executes +// failure-detection processes. 
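The emergently* helpers above all throttle themselves the same way: a TTL cache whose Add fails if the key is already present, so a repeated trigger within the window becomes a no-op. A standalone sketch of that pattern using github.com/patrickmn/go-cache (already a dependency of this module); the key and TTL values here are illustrative only:

package main

import (
	"fmt"
	"time"

	"github.com/patrickmn/go-cache"
)

// recentAttempts plays the role of emergencyReadTopologyInstanceMap and friends:
// a TTL cache used purely as a throttle.
var recentAttempts = cache.New(time.Minute, 10*time.Minute)

// attemptOnce runs fn for key at most once per TTL window.
func attemptOnce(key string, fn func()) bool {
	if err := recentAttempts.Add(key, true, cache.DefaultExpiration); err != nil {
		// Just recently attempted for this key; skip.
		return false
	}
	fn()
	return true
}

func main() {
	probe := func() { fmt.Println("probing the instance") }
	fmt.Println(attemptOnce("db1:3306", probe)) // prints "probing the instance", then true
	fmt.Println(attemptOnce("db1:3306", probe)) // false: still within the TTL window
}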
+func checkAndExecuteFailureDetectionProcesses(analysisEntry inst.ReplicationAnalysis, skipProcesses bool) (detectionRegistrationSuccess bool, processesExecutionAttempted bool, err error) { + if ok, _ := AttemptFailureDetectionRegistration(&analysisEntry); !ok { + if util.ClearToLog("checkAndExecuteFailureDetectionProcesses", analysisEntry.AnalyzedInstanceKey.StringCode()) { + log.Infof("checkAndExecuteFailureDetectionProcesses: could not register %+v detection on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) + } + return false, false, nil + } + log.Infof("topology_recovery: detected %+v failure on %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) + // Execute on-detection processes + if skipProcesses { + return true, false, nil + } + err = executeProcesses(config.Config.OnFailureDetectionProcesses, "OnFailureDetectionProcesses", NewTopologyRecovery(analysisEntry), true) + return true, true, err +} + +func getCheckAndRecoverFunction(analysisCode inst.AnalysisCode, analyzedInstanceKey *inst.InstanceKey) ( + checkAndRecoverFunction func(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error), + isActionableRecovery bool, +) { + switch analysisCode { + // master + case inst.DeadMaster, inst.DeadMasterAndSomeReplicas: + if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) { + return checkAndRecoverGenericProblem, false + } else { + return checkAndRecoverDeadMaster, true + } + case inst.LockedSemiSyncMaster: + if isInEmergencyOperationGracefulPeriod(analyzedInstanceKey) { + return checkAndRecoverGenericProblem, false + } else { + return checkAndRecoverLockedSemiSyncMaster, true + } + // intermediate master + case inst.DeadIntermediateMaster: + return checkAndRecoverDeadIntermediateMaster, true + case inst.DeadIntermediateMasterAndSomeReplicas: + return checkAndRecoverDeadIntermediateMaster, true + case inst.DeadIntermediateMasterWithSingleReplicaFailingToConnect: + return checkAndRecoverDeadIntermediateMaster, true + case inst.AllIntermediateMasterReplicasFailingToConnectOrDead: + return checkAndRecoverDeadIntermediateMaster, true + case inst.DeadIntermediateMasterAndReplicas: + return checkAndRecoverGenericProblem, false + // co-master + case inst.DeadCoMaster: + return checkAndRecoverDeadCoMaster, true + case inst.DeadCoMasterAndSomeReplicas: + return checkAndRecoverDeadCoMaster, true + // master, non actionable + case inst.DeadMasterAndReplicas: + return checkAndRecoverGenericProblem, false + case inst.UnreachableMaster: + return checkAndRecoverGenericProblem, false + case inst.UnreachableMasterWithLaggingReplicas: + return checkAndRecoverGenericProblem, false + case inst.AllMasterReplicasNotReplicating: + return checkAndRecoverGenericProblem, false + case inst.AllMasterReplicasNotReplicatingOrDead: + return checkAndRecoverGenericProblem, false + case inst.UnreachableIntermediateMasterWithLaggingReplicas: + return checkAndRecoverGenericProblem, false + } + // Right now this is mostly causing noise with no clear action. + // Will revisit this in the future. 
+ // case inst.AllMasterReplicasStale: + // return checkAndRecoverGenericProblem, false + + return nil, false +} + +func runEmergentOperations(analysisEntry *inst.ReplicationAnalysis) { + switch analysisEntry.Analysis { + case inst.DeadMasterAndReplicas: + go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceMasterKey, analysisEntry.Analysis) + case inst.UnreachableMaster: + go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + go emergentlyReadTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + case inst.UnreachableMasterWithLaggingReplicas: + go emergentlyRestartReplicationOnTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + case inst.LockedSemiSyncMasterHypothesis: + go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + go emergentlyRecordStaleBinlogCoordinates(&analysisEntry.AnalyzedInstanceKey, &analysisEntry.AnalyzedInstanceBinlogCoordinates) + case inst.UnreachableIntermediateMasterWithLaggingReplicas: + go emergentlyRestartReplicationOnTopologyInstanceReplicas(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + case inst.AllMasterReplicasNotReplicating: + go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + case inst.AllMasterReplicasNotReplicatingOrDead: + go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceKey, analysisEntry.Analysis) + case inst.FirstTierReplicaFailingToConnectToMaster: + go emergentlyReadTopologyInstance(&analysisEntry.AnalyzedInstanceMasterKey, analysisEntry.Analysis) + } +} + +// executeCheckAndRecoverFunction will choose the correct check & recovery function based on analysis. +// It executes the function synchronuously +func executeCheckAndRecoverFunction(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, forceInstanceRecovery bool, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + atomic.AddInt64(&countPendingRecoveries, 1) + defer atomic.AddInt64(&countPendingRecoveries, -1) + + checkAndRecoverFunction, isActionableRecovery := getCheckAndRecoverFunction(analysisEntry.Analysis, &analysisEntry.AnalyzedInstanceKey) + analysisEntry.IsActionableRecovery = isActionableRecovery + runEmergentOperations(&analysisEntry) + + if checkAndRecoverFunction == nil { + // Unhandled problem type + if analysisEntry.Analysis != inst.NoProblem { + if util.ClearToLog("executeCheckAndRecoverFunction", analysisEntry.AnalyzedInstanceKey.StringCode()) { + log.Warningf("executeCheckAndRecoverFunction: ignoring analysisEntry that has no action plan: %+v; key: %+v", + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey) + } + } + + return false, nil, nil + } + // we have a recovery function; its execution still depends on filters if not disabled. + if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: detection", analysisEntry.AnalyzedInstanceKey.StringCode()) { + log.Infof("executeCheckAndRecoverFunction: proceeding with %+v detection on %+v; isActionable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, isActionableRecovery, skipProcesses) + } + + // At this point we have validated there's a failure scenario for which we have a recovery path. + + if orcraft.IsRaftEnabled() { + // with raft, all nodes can (and should) run analysis, + // but only the leader proceeds to execute detection hooks and then to failover. 
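Before the recovery path below actually runs, it is gated twice: under raft only the leader may execute detection hooks and failover, and a global kill switch can veto recoveries unless the caller forces one. A standalone sketch of that gating order, with hypothetical flags standing in for orcraft.IsRaftEnabled/IsLeader, IsRecoveryDisabled and forceInstanceRecovery:

package main

import (
	"errors"
	"fmt"
)

// gates holds hypothetical stand-ins for the checks performed by executeCheckAndRecoverFunction.
type gates struct {
	raftEnabled      bool
	isLeader         bool
	recoveryDisabled bool
	force            bool
}

// shouldRecover returns nil only when this node is allowed to proceed to recovery.
func shouldRecover(g gates) error {
	if g.raftEnabled && !g.isLeader {
		return errors.New("raft non-leader: analyze only, do not recover")
	}
	if g.recoveryDisabled && !g.force {
		return errors.New("recoveries are disabled globally")
	}
	return nil
}

func main() {
	fmt.Println(shouldRecover(gates{raftEnabled: true, isLeader: false}))
	fmt.Println(shouldRecover(gates{recoveryDisabled: true, force: true})) // <nil>: forcing overrides the kill switch
}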
+ if !orcraft.IsLeader() { + log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+ + "skipProcesses: %v: NOT detecting/recovering host (raft non-leader)", + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses) + return false, nil, err + } + } + + // Initiate detection: + registrationSuccess, _, err := checkAndExecuteFailureDetectionProcesses(analysisEntry, skipProcesses) + if registrationSuccess { + if orcraft.IsRaftEnabled() { + _, err := orcraft.PublishCommand("register-failure-detection", analysisEntry) + log.Errore(err) + } + } + if err != nil { + log.Errorf("executeCheckAndRecoverFunction: error on failure detection: %+v", err) + return false, nil, err + } + // We don't mind whether detection really executed the processes or not + // (it may have been silenced due to previous detection). We only care there's no error. + + // We're about to embark on recovery shortly... + + // Check for recovery being disabled globally + if recoveryDisabledGlobally, err := IsRecoveryDisabled(); err != nil { + // Unexpected. Shouldn't get this + log.Errorf("Unable to determine if recovery is disabled globally: %v", err) + } else if recoveryDisabledGlobally { + if !forceInstanceRecovery { + log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+ + "skipProcesses: %v: NOT Recovering host (disabled globally)", + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses) + + return false, nil, err + } + log.Infof("CheckAndRecover: Analysis: %+v, InstanceKey: %+v, candidateInstanceKey: %+v, "+ + "skipProcesses: %v: recoveries disabled globally but forcing this recovery", + analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, candidateInstanceKey, skipProcesses) + } + + // Actually attempt recovery: + if isActionableRecovery || util.ClearToLog("executeCheckAndRecoverFunction: recovery", analysisEntry.AnalyzedInstanceKey.StringCode()) { + log.Infof("executeCheckAndRecoverFunction: proceeding with %+v recovery on %+v; isRecoverable?: %+v; skipProcesses: %+v", analysisEntry.Analysis, analysisEntry.AnalyzedInstanceKey, isActionableRecovery, skipProcesses) + } + recoveryAttempted, topologyRecovery, err = checkAndRecoverFunction(analysisEntry, candidateInstanceKey, forceInstanceRecovery, skipProcesses) + if !recoveryAttempted { + return recoveryAttempted, topologyRecovery, err + } + if topologyRecovery == nil { + return recoveryAttempted, topologyRecovery, err + } + if b, err := json.Marshal(topologyRecovery); err == nil { + log.Infof("Topology recovery: %+v", string(b)) + } else { + log.Infof("Topology recovery: %+v", *topologyRecovery) + } + if !skipProcesses { + if topologyRecovery.SuccessorKey == nil { + // Execute general unsuccessful post failover processes + executeProcesses(config.Config.PostUnsuccessfulFailoverProcesses, "PostUnsuccessfulFailoverProcesses", topologyRecovery, false) + } else { + // Execute general post failover processes + inst.EndDowntime(topologyRecovery.SuccessorKey) + executeProcesses(config.Config.PostFailoverProcesses, "PostFailoverProcesses", topologyRecovery, false) + } + } + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Waiting for %d postponed functions", topologyRecovery.PostponedFunctionsContainer.Len())) + topologyRecovery.Wait() + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Executed %d postponed functions", topologyRecovery.PostponedFunctionsContainer.Len())) + if 
topologyRecovery.PostponedFunctionsContainer.Len() > 0 { + AuditTopologyRecovery(topologyRecovery, fmt.Sprintf("Executed postponed functions: %+v", strings.Join(topologyRecovery.PostponedFunctionsContainer.Descriptions(), ", "))) + } + return recoveryAttempted, topologyRecovery, err +} + +// CheckAndRecover is the main entry point for the recovery mechanism +func CheckAndRecover(specificInstance *inst.InstanceKey, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, promotedReplicaKey *inst.InstanceKey, err error) { + // Allow the analysis to run even if we don't want to recover + replicationAnalysis, err := inst.GetReplicationAnalysis("", &inst.ReplicationAnalysisHints{IncludeDowntimed: true, AuditAnalysis: true}) + if err != nil { + return false, nil, log.Errore(err) + } + if *config.RuntimeCLIFlags.Noop { + log.Infof("--noop provided; will not execute processes") + skipProcesses = true + } + // intentionally iterating entries in random order + for _, j := range rand.Perm(len(replicationAnalysis)) { + analysisEntry := replicationAnalysis[j] + if specificInstance != nil { + // We are looking for a specific instance; if this is not the one, skip! + if !specificInstance.Equals(&analysisEntry.AnalyzedInstanceKey) { + continue + } + } + if analysisEntry.SkippableDueToDowntime && specificInstance == nil { + // Only recover a downtimed server if explicitly requested + continue + } + + if specificInstance != nil { + // force mode. Keep it synchronuous + var topologyRecovery *TopologyRecovery + recoveryAttempted, topologyRecovery, err = executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses) + log.Errore(err) + if topologyRecovery != nil { + promotedReplicaKey = topologyRecovery.SuccessorKey + } + } else { + go func() { + _, _, err := executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, false, skipProcesses) + log.Errore(err) + }() + } + } + return recoveryAttempted, promotedReplicaKey, err +} + +func forceAnalysisEntry(clusterName string, analysisCode inst.AnalysisCode, commandHint string, failedInstanceKey *inst.InstanceKey) (analysisEntry inst.ReplicationAnalysis, err error) { + clusterInfo, err := inst.ReadClusterInfo(clusterName) + if err != nil { + return analysisEntry, err + } + + clusterAnalysisEntries, err := inst.GetReplicationAnalysis(clusterInfo.ClusterName, &inst.ReplicationAnalysisHints{IncludeDowntimed: true, IncludeNoProblem: true}) + if err != nil { + return analysisEntry, err + } + + for _, entry := range clusterAnalysisEntries { + if entry.AnalyzedInstanceKey.Equals(failedInstanceKey) { + analysisEntry = entry + } + } + analysisEntry.Analysis = analysisCode // we force this analysis + analysisEntry.CommandHint = commandHint + analysisEntry.ClusterDetails = *clusterInfo + analysisEntry.AnalyzedInstanceKey = *failedInstanceKey + + return analysisEntry, nil +} + +// ForceExecuteRecovery can be called to issue a recovery process even if analysis says there is no recovery case. +// The caller of this function injects the type of analysis it wishes the function to assume. +// By calling this function one takes responsibility for one's actions. 
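CheckAndRecover above intentionally walks the analysis entries in random order via rand.Perm, so that when several problems are reported at once no single entry is systematically handled first. A tiny standalone illustration of that iteration pattern, with made-up entries:

package main

import (
	"fmt"
	"math/rand"
)

func main() {
	entries := []string{"DeadMaster on db1", "DeadIntermediateMaster on db2", "UnreachableMaster on db3"}
	// Same idea as `for _, j := range rand.Perm(len(replicationAnalysis))`:
	// visit every entry exactly once, in a random order.
	for _, j := range rand.Perm(len(entries)) {
		fmt.Println("handling:", entries[j])
	}
}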
+func ForceExecuteRecovery(analysisEntry inst.ReplicationAnalysis, candidateInstanceKey *inst.InstanceKey, skipProcesses bool) (recoveryAttempted bool, topologyRecovery *TopologyRecovery, err error) { + return executeCheckAndRecoverFunction(analysisEntry, candidateInstanceKey, true, skipProcesses) +} + +// ForceMasterFailover *trusts* master of given cluster is dead and initiates a failover +func ForceMasterFailover(clusterName string) (topologyRecovery *TopologyRecovery, err error) { + clusterMasters, err := inst.ReadClusterMaster(clusterName) + if err != nil { + return nil, fmt.Errorf("Cannot deduce cluster master for %+v", clusterName) + } + if len(clusterMasters) != 1 { + return nil, fmt.Errorf("Cannot deduce cluster master for %+v", clusterName) + } + clusterMaster := clusterMasters[0] + + analysisEntry, err := forceAnalysisEntry(clusterName, inst.DeadMaster, inst.ForceMasterFailoverCommandHint, &clusterMaster.Key) + if err != nil { + return nil, err + } + recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, nil, false) + if err != nil { + return nil, err + } + if !recoveryAttempted { + return nil, fmt.Errorf("Unexpected error: recovery not attempted. This should not happen") + } + if topologyRecovery == nil { + return nil, fmt.Errorf("Recovery attempted but with no results. This should not happen") + } + if topologyRecovery.SuccessorKey == nil { + return nil, fmt.Errorf("Recovery attempted yet no replica promoted") + } + return topologyRecovery, nil +} + +// ForceMasterTakeover *trusts* master of given cluster is dead and fails over to designated instance, +// which has to be its direct child. +func ForceMasterTakeover(clusterName string, destination *inst.Instance) (topologyRecovery *TopologyRecovery, err error) { + clusterMasters, err := inst.ReadClusterWriteableMaster(clusterName) + if err != nil { + return nil, fmt.Errorf("Cannot deduce cluster master for %+v", clusterName) + } + if len(clusterMasters) != 1 { + return nil, fmt.Errorf("Cannot deduce cluster master for %+v", clusterName) + } + clusterMaster := clusterMasters[0] + + if !destination.MasterKey.Equals(&clusterMaster.Key) { + return nil, fmt.Errorf("You may only promote a direct child of the master %+v. The master of %+v is %+v.", clusterMaster.Key, destination.Key, destination.MasterKey) + } + log.Infof("Will demote %+v and promote %+v instead", clusterMaster.Key, destination.Key) + + analysisEntry, err := forceAnalysisEntry(clusterName, inst.DeadMaster, inst.ForceMasterTakeoverCommandHint, &clusterMaster.Key) + if err != nil { + return nil, err + } + recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &destination.Key, false) + if err != nil { + return nil, err + } + if !recoveryAttempted { + return nil, fmt.Errorf("Unexpected error: recovery not attempted. This should not happen") + } + if topologyRecovery == nil { + return nil, fmt.Errorf("Recovery attempted but with no results. This should not happen") + } + if topologyRecovery.SuccessorKey == nil { + return nil, fmt.Errorf("Recovery attempted yet no replica promoted") + } + return topologyRecovery, nil +} + +func getGracefulMasterTakeoverDesignatedInstance(clusterMasterKey *inst.InstanceKey, designatedKey *inst.InstanceKey, clusterMasterDirectReplicas [](*inst.Instance), auto bool) (designatedInstance *inst.Instance, err error) { + if designatedKey == nil { + // User did not specify a replica to promote + if len(clusterMasterDirectReplicas) == 1 { + // Single replica. 
That's the one we'll promote + return clusterMasterDirectReplicas[0], nil + } + // More than one replica. + if !auto { + return nil, fmt.Errorf("GracefulMasterTakeover: target instance not indicated, auto=false, and master %+v has %+v replicas. orchestrator cannot choose where to failover to. Aborting", *clusterMasterKey, len(clusterMasterDirectReplicas)) + } + log.Debugf("GracefulMasterTakeover: request takeover for master %+v, no designated replica indicated. orchestrator will attempt to auto deduce replica.", *clusterMasterKey) + designatedInstance, _, _, _, _, err = inst.GetCandidateReplica(clusterMasterKey, false) + if err != nil || designatedInstance == nil { + return nil, fmt.Errorf("GracefulMasterTakeover: no target instance indicated, failed to auto-detect candidate replica for master %+v. Aborting", *clusterMasterKey) + } + log.Debugf("GracefulMasterTakeover: candidateReplica=%+v", designatedInstance.Key) + if _, err := inst.StartReplication(&designatedInstance.Key); err != nil { + return nil, fmt.Errorf("GracefulMasterTakeover:cannot start replication on designated replica %+v. Aborting", designatedKey) + } + log.Infof("GracefulMasterTakeover: designated master deduced to be %+v", designatedInstance.Key) + return designatedInstance, nil + } + + // Verify designated instance is a direct replica of master + for _, directReplica := range clusterMasterDirectReplicas { + if directReplica.Key.Equals(designatedKey) { + designatedInstance = directReplica + } + } + if designatedInstance == nil { + return nil, fmt.Errorf("GracefulMasterTakeover: indicated designated instance %+v must be directly replicating from the master %+v", *designatedKey, *clusterMasterKey) + } + log.Infof("GracefulMasterTakeover: designated master instructed to be %+v", designatedInstance.Key) + return designatedInstance, nil +} + +// GracefulMasterTakeover will demote master of existing topology and promote its +// direct replica instead. +// It expects that replica to have no siblings. +// This function is graceful in that it will first lock down the master, then wait +// for the designated replica to catch up with last position. +// It will point old master at the newly promoted master at the correct coordinates, but will not start replication. +func GracefulMasterTakeover(clusterName string, designatedKey *inst.InstanceKey, auto bool) (topologyRecovery *TopologyRecovery, promotedMasterCoordinates *inst.BinlogCoordinates, err error) { + clusterMasters, err := inst.ReadClusterMaster(clusterName) + if err != nil { + return nil, nil, fmt.Errorf("Cannot deduce cluster master for %+v; error: %+v", clusterName, err) + } + if len(clusterMasters) != 1 { + return nil, nil, fmt.Errorf("Cannot deduce cluster master for %+v. 
Found %+v potential masters", clusterName, len(clusterMasters))
+	}
+	clusterMaster := clusterMasters[0]
+
+	clusterMasterDirectReplicas, err := inst.ReadReplicaInstances(&clusterMaster.Key)
+	if err != nil {
+		return nil, nil, log.Errore(err)
+	}
+
+	if len(clusterMasterDirectReplicas) == 0 {
+		return nil, nil, fmt.Errorf("Master %+v doesn't seem to have replicas", clusterMaster.Key)
+	}
+
+	if designatedKey != nil && !designatedKey.IsValid() {
+		// An empty or invalid key is as good as no key
+		designatedKey = nil
+	}
+	designatedInstance, err := getGracefulMasterTakeoverDesignatedInstance(&clusterMaster.Key, designatedKey, clusterMasterDirectReplicas, auto)
+	if err != nil {
+		return nil, nil, log.Errore(err)
+	}
+
+	if inst.IsBannedFromBeingCandidateReplica(designatedInstance) {
+		return nil, nil, fmt.Errorf("GracefulMasterTakeover: designated instance %+v cannot be promoted due to promotion rule or it is explicitly ignored in PromotionIgnoreHostnameFilters configuration", designatedInstance.Key)
+	}
+
+	masterOfDesignatedInstance, err := inst.GetInstanceMaster(designatedInstance)
+	if err != nil {
+		return nil, nil, err
+	}
+	if !masterOfDesignatedInstance.Key.Equals(&clusterMaster.Key) {
+		return nil, nil, fmt.Errorf("Sanity check failure. It seems like the designated instance %+v does not replicate from the master %+v (designated instance's master key is %+v). This error is strange. Panicking", designatedInstance.Key, clusterMaster.Key, designatedInstance.MasterKey)
+	}
+	if !designatedInstance.HasReasonableMaintenanceReplicationLag() {
+		return nil, nil, fmt.Errorf("Designated instance %+v seems to be lagging too much for this operation. Aborting.", designatedInstance.Key)
+	}
+
+	if len(clusterMasterDirectReplicas) > 1 {
+		log.Infof("GracefulMasterTakeover: Will let %+v take over its siblings", designatedInstance.Key)
+		relocatedReplicas, _, err, _ := inst.RelocateReplicas(&clusterMaster.Key, &designatedInstance.Key, "")
+		if len(relocatedReplicas) != len(clusterMasterDirectReplicas)-1 {
+			// We are unable to make designated instance master of all its siblings
+			relocatedReplicasKeyMap := inst.NewInstanceKeyMap()
+			relocatedReplicasKeyMap.AddInstances(relocatedReplicas)
+			// Let's see which replicas have not been relocated
+			for _, directReplica := range clusterMasterDirectReplicas {
+				if relocatedReplicasKeyMap.HasKey(directReplica.Key) {
+					// relocated, good
+					continue
+				}
+				if directReplica.Key.Equals(&designatedInstance.Key) {
+					// obviously we skip this one
+					continue
+				}
+				if directReplica.IsDowntimed {
+					// obviously we skip this one
+					log.Warningf("GracefulMasterTakeover: unable to relocate %+v below designated %+v, but since it is downtimed (downtime reason: %s) I will proceed", directReplica.Key, designatedInstance.Key, directReplica.DowntimeReason)
+					continue
+				}
+				return nil, nil, fmt.Errorf("Designated instance %+v cannot take over all of its siblings.
Error: %+v", designatedInstance.Key, err) + } + } + } + log.Infof("GracefulMasterTakeover: Will demote %+v and promote %+v instead", clusterMaster.Key, designatedInstance.Key) + + replicationUser, replicationPassword, replicationCredentialsError := inst.ReadReplicationCredentials(&designatedInstance.Key) + + analysisEntry, err := forceAnalysisEntry(clusterName, inst.DeadMaster, inst.GracefulMasterTakeoverCommandHint, &clusterMaster.Key) + if err != nil { + return nil, nil, err + } + preGracefulTakeoverTopologyRecovery := &TopologyRecovery{ + SuccessorKey: &designatedInstance.Key, + AnalysisEntry: analysisEntry, + } + if err := executeProcesses(config.Config.PreGracefulTakeoverProcesses, "PreGracefulTakeoverProcesses", preGracefulTakeoverTopologyRecovery, true); err != nil { + return nil, nil, fmt.Errorf("Failed running PreGracefulTakeoverProcesses: %+v", err) + } + + log.Infof("GracefulMasterTakeover: Will set %+v as read_only", clusterMaster.Key) + if clusterMaster, err = inst.SetReadOnly(&clusterMaster.Key, true); err != nil { + return nil, nil, err + } + demotedMasterSelfBinlogCoordinates := &clusterMaster.SelfBinlogCoordinates + log.Infof("GracefulMasterTakeover: Will wait for %+v to reach master coordinates %+v", designatedInstance.Key, *demotedMasterSelfBinlogCoordinates) + if designatedInstance, _, err = inst.WaitForExecBinlogCoordinatesToReach(&designatedInstance.Key, demotedMasterSelfBinlogCoordinates, time.Duration(config.Config.ReasonableMaintenanceReplicationLagSeconds)*time.Second); err != nil { + return nil, nil, err + } + promotedMasterCoordinates = &designatedInstance.SelfBinlogCoordinates + + log.Infof("GracefulMasterTakeover: attempting recovery") + recoveryAttempted, topologyRecovery, err := ForceExecuteRecovery(analysisEntry, &designatedInstance.Key, false) + if err != nil { + log.Errorf("GracefulMasterTakeover: noting an error, and for now proceeding: %+v", err) + } + if !recoveryAttempted { + return nil, nil, fmt.Errorf("GracefulMasterTakeover: unexpected error: recovery not attempted. This should not happen") + } + if topologyRecovery == nil { + return nil, nil, fmt.Errorf("GracefulMasterTakeover: recovery attempted but with no results. This should not happen") + } + if topologyRecovery.SuccessorKey == nil { + // Promotion fails. + // Undo setting read-only on original master. + inst.SetReadOnly(&clusterMaster.Key, false) + return nil, nil, fmt.Errorf("GracefulMasterTakeover: Recovery attempted yet no replica promoted; err=%+v", err) + } + var gtidHint inst.OperationGTIDHint = inst.GTIDHintNeutral + if topologyRecovery.RecoveryType == MasterRecoveryGTID { + gtidHint = inst.GTIDHintForce + } + clusterMaster, err = inst.ChangeMasterTo(&clusterMaster.Key, &designatedInstance.Key, promotedMasterCoordinates, false, gtidHint) + if !clusterMaster.SelfBinlogCoordinates.Equals(demotedMasterSelfBinlogCoordinates) { + log.Errorf("GracefulMasterTakeover: sanity problem. 
Demoted master's coordinates changed from %+v to %+v while supposed to have been frozen", *demotedMasterSelfBinlogCoordinates, clusterMaster.SelfBinlogCoordinates) + } + if !clusterMaster.HasReplicationCredentials && replicationCredentialsError == nil { + _, credentialsErr := inst.ChangeMasterCredentials(&clusterMaster.Key, replicationUser, replicationPassword) + if err == nil { + err = credentialsErr + } + } + if auto { + _, startReplicationErr := inst.StartReplication(&clusterMaster.Key) + if err == nil { + err = startReplicationErr + } + } + + if designatedInstance.AllowTLS { + _, enableSSLErr := inst.EnableMasterSSL(&clusterMaster.Key) + if err == nil { + err = enableSSLErr + } + } + executeProcesses(config.Config.PostGracefulTakeoverProcesses, "PostGracefulTakeoverProcesses", topologyRecovery, false) + + return topologyRecovery, promotedMasterCoordinates, err +} diff --git a/go/vt/orchestrator/logic/topology_recovery_dao.go b/go/vt/orchestrator/logic/topology_recovery_dao.go new file mode 100644 index 0000000000..c586ef03bb --- /dev/null +++ b/go/vt/orchestrator/logic/topology_recovery_dao.go @@ -0,0 +1,862 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package logic + +import ( + "fmt" + "strings" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + "vitess.io/vitess/go/vt/orchestrator/inst" + "vitess.io/vitess/go/vt/orchestrator/process" + orcraft "vitess.io/vitess/go/vt/orchestrator/raft" + "vitess.io/vitess/go/vt/orchestrator/util" +) + +// AttemptFailureDetectionRegistration tries to add a failure-detection entry; if this fails that means the problem has already been detected +func AttemptFailureDetectionRegistration(analysisEntry *inst.ReplicationAnalysis) (registrationSuccessful bool, err error) { + args := sqlutils.Args( + analysisEntry.AnalyzedInstanceKey.Hostname, + analysisEntry.AnalyzedInstanceKey.Port, + process.ThisHostname, + util.ProcessToken.Hash, + string(analysisEntry.Analysis), + analysisEntry.ClusterDetails.ClusterName, + analysisEntry.ClusterDetails.ClusterAlias, + analysisEntry.CountReplicas, + analysisEntry.Replicas.ToCommaDelimitedList(), + analysisEntry.IsActionableRecovery, + ) + startActivePeriodHint := "now()" + if analysisEntry.StartActivePeriod != "" { + startActivePeriodHint = "?" + args = append(args, analysisEntry.StartActivePeriod) + } + + query := fmt.Sprintf(` + insert ignore + into topology_failure_detection ( + hostname, + port, + in_active_period, + end_active_period_unixtime, + processing_node_hostname, + processcing_node_token, + analysis, + cluster_name, + cluster_alias, + count_affected_slaves, + slave_hosts, + is_actionable, + start_active_period + ) values ( + ?, + ?, + 1, + 0, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + %s + ) + `, startActivePeriodHint) + + sqlResult, err := db.ExecOrchestrator(query, args...) 
+ if err != nil { + return false, log.Errore(err) + } + rows, err := sqlResult.RowsAffected() + if err != nil { + return false, log.Errore(err) + } + return (rows > 0), nil +} + +// ClearActiveFailureDetections clears the "in_active_period" flag for old-enough detections, thereby allowing for +// further detections on cleared instances. +func ClearActiveFailureDetections() error { + _, err := db.ExecOrchestrator(` + update topology_failure_detection set + in_active_period = 0, + end_active_period_unixtime = UNIX_TIMESTAMP() + where + in_active_period = 1 + AND start_active_period < NOW() - INTERVAL ? MINUTE + `, + config.Config.FailureDetectionPeriodBlockMinutes, + ) + return log.Errore(err) +} + +// clearAcknowledgedFailureDetections clears the "in_active_period" flag for detections +// that were acknowledged +func clearAcknowledgedFailureDetections(whereClause string, args []interface{}) error { + query := fmt.Sprintf(` + update topology_failure_detection set + in_active_period = 0, + end_active_period_unixtime = UNIX_TIMESTAMP() + where + in_active_period = 1 + and %s + `, whereClause) + _, err := db.ExecOrchestrator(query, args...) + return log.Errore(err) +} + +// AcknowledgeInstanceFailureDetection clears a failure detection for a particular +// instance. This is automated by recovery process: it makes sense to acknowledge +// the detection of an instance just recovered. +func acknowledgeInstanceFailureDetection(instanceKey *inst.InstanceKey) error { + whereClause := ` + hostname = ? + and port = ? + ` + args := sqlutils.Args(instanceKey.Hostname, instanceKey.Port) + return clearAcknowledgedFailureDetections(whereClause, args) +} + +func writeTopologyRecovery(topologyRecovery *TopologyRecovery) (*TopologyRecovery, error) { + analysisEntry := topologyRecovery.AnalysisEntry + sqlResult, err := db.ExecOrchestrator(` + insert ignore + into topology_recovery ( + recovery_id, + uid, + hostname, + port, + in_active_period, + start_active_period, + end_active_period_unixtime, + processing_node_hostname, + processcing_node_token, + analysis, + cluster_name, + cluster_alias, + count_affected_slaves, + slave_hosts, + last_detection_id + ) values ( + ?, + ?, + ?, + ?, + 1, + NOW(), + 0, + ?, + ?, + ?, + ?, + ?, + ?, + ?, + (select ifnull(max(detection_id), 0) from topology_failure_detection where hostname=? and port=?) + ) + `, + sqlutils.NilIfZero(topologyRecovery.Id), + topologyRecovery.UID, + analysisEntry.AnalyzedInstanceKey.Hostname, analysisEntry.AnalyzedInstanceKey.Port, + process.ThisHostname, util.ProcessToken.Hash, + string(analysisEntry.Analysis), + analysisEntry.ClusterDetails.ClusterName, + analysisEntry.ClusterDetails.ClusterAlias, + analysisEntry.CountReplicas, analysisEntry.Replicas.ToCommaDelimitedList(), + analysisEntry.AnalyzedInstanceKey.Hostname, analysisEntry.AnalyzedInstanceKey.Port, + ) + if err != nil { + return nil, err + } + rows, err := sqlResult.RowsAffected() + if err != nil { + return nil, err + } + if rows == 0 { + return nil, nil + } + lastInsertId, err := sqlResult.LastInsertId() + if err != nil { + return nil, err + } + topologyRecovery.Id = lastInsertId + return topologyRecovery, nil +} + +// AttemptRecoveryRegistration tries to add a recovery entry; if this fails that means recovery is already in place. 
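Both registration paths in this file, the failure-detection insert above and writeTopologyRecovery used by AttemptRecoveryRegistration below, rely on the same idempotency trick: an insert ignore whose RowsAffected result says whether this node was first to register. A standalone sketch of that trick; it uses an in-memory SQLite table (insert or ignore is SQLite's spelling of MySQL's insert ignore) purely to stay runnable, and the two-column table is a hypothetical stand-in for topology_failure_detection:

package main

import (
	"database/sql"
	"fmt"

	_ "github.com/mattn/go-sqlite3"
)

// register returns true only for the first registration of (host, port);
// later attempts are silently ignored and report zero affected rows.
func register(db *sql.DB, host string, port int) (bool, error) {
	res, err := db.Exec(`insert or ignore into failure_detection (hostname, port) values (?, ?)`, host, port)
	if err != nil {
		return false, err
	}
	rows, err := res.RowsAffected()
	if err != nil {
		return false, err
	}
	return rows > 0, nil
}

func main() {
	db, err := sql.Open("sqlite3", ":memory:")
	if err != nil {
		panic(err)
	}
	defer db.Close()
	if _, err := db.Exec(`create table failure_detection (hostname text, port int, primary key (hostname, port))`); err != nil {
		panic(err)
	}
	fmt.Println(register(db, "db1", 3306)) // true <nil>: first writer wins
	fmt.Println(register(db, "db1", 3306)) // false <nil>: already registered
}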
+func AttemptRecoveryRegistration(analysisEntry *inst.ReplicationAnalysis, failIfFailedInstanceInActiveRecovery bool, failIfClusterInActiveRecovery bool) (*TopologyRecovery, error) { + if failIfFailedInstanceInActiveRecovery { + // Let's check if this instance has just been promoted recently and is still in active period. + // If so, we reject recovery registration to avoid flapping. + recoveries, err := ReadInActivePeriodSuccessorInstanceRecovery(&analysisEntry.AnalyzedInstanceKey) + if err != nil { + return nil, log.Errore(err) + } + if len(recoveries) > 0 { + RegisterBlockedRecoveries(analysisEntry, recoveries) + return nil, log.Errorf("AttemptRecoveryRegistration: instance %+v has recently been promoted (by failover of %+v) and is in active period. It will not be failed over. You may acknowledge the failure on %+v (-c ack-instance-recoveries) to remove this blockage", analysisEntry.AnalyzedInstanceKey, recoveries[0].AnalysisEntry.AnalyzedInstanceKey, recoveries[0].AnalysisEntry.AnalyzedInstanceKey) + } + } + if failIfClusterInActiveRecovery { + // Let's check if this cluster has just experienced a failover and is still in active period. + // If so, we reject recovery registration to avoid flapping. + recoveries, err := ReadInActivePeriodClusterRecovery(analysisEntry.ClusterDetails.ClusterName) + if err != nil { + return nil, log.Errore(err) + } + if len(recoveries) > 0 { + RegisterBlockedRecoveries(analysisEntry, recoveries) + return nil, log.Errorf("AttemptRecoveryRegistration: cluster %+v has recently experienced a failover (of %+v) and is in active period. It will not be failed over again. You may acknowledge the failure on this cluster (-c ack-cluster-recoveries) or on %+v (-c ack-instance-recoveries) to remove this blockage", analysisEntry.ClusterDetails.ClusterName, recoveries[0].AnalysisEntry.AnalyzedInstanceKey, recoveries[0].AnalysisEntry.AnalyzedInstanceKey) + } + } + if !failIfFailedInstanceInActiveRecovery { + // Implicitly acknowledge this instance's possibly existing active recovery, provided they are completed. + AcknowledgeInstanceCompletedRecoveries(&analysisEntry.AnalyzedInstanceKey, "orchestrator", fmt.Sprintf("implicit acknowledge due to user invocation of recovery on same instance: %+v", analysisEntry.AnalyzedInstanceKey)) + // The fact we only acknowledge a completed recovery solves the possible case of two DBAs simultaneously + // trying to recover the same instance at the same time + } + + topologyRecovery := NewTopologyRecovery(*analysisEntry) + + topologyRecovery, err := writeTopologyRecovery(topologyRecovery) + if err != nil { + return nil, log.Errore(err) + } + if orcraft.IsRaftEnabled() { + if _, err := orcraft.PublishCommand("write-recovery", topologyRecovery); err != nil { + return nil, log.Errore(err) + } + } + return topologyRecovery, nil +} + +// ClearActiveRecoveries clears the "in_active_period" flag for old-enough recoveries, thereby allowing for +// further recoveries on cleared instances. +func ClearActiveRecoveries() error { + _, err := db.ExecOrchestrator(` + update topology_recovery set + in_active_period = 0, + end_active_period_unixtime = UNIX_TIMESTAMP() + where + in_active_period = 1 + AND start_active_period < NOW() - INTERVAL ? SECOND + `, + config.Config.RecoveryPeriodBlockSeconds, + ) + return log.Errore(err) +} + +// RegisterBlockedRecoveries writes down currently blocked recoveries, and indicates what recovery they are blocked on. +// Recoveries are blocked thru the in_active_period flag, which comes to avoid flapping. 
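RegisterBlockedRecoveries below, together with the checks in AttemptRecoveryRegistration above, enforces an anti-flapping rule: while a previous recovery of the same instance or cluster is still inside its active period, a new recovery is refused and recorded as blocked. A standalone, time-based sketch of that rule; the duration and cluster names are illustrative, not the configured RecoveryPeriodBlockSeconds:

package main

import (
	"fmt"
	"time"
)

const recoveryBlockPeriod = 30 * time.Minute

// recoveryLog maps a cluster name to the start time of its last recovery.
type recoveryLog map[string]time.Time

// mayRecover reports whether a new recovery is allowed, i.e. the last one
// (if any) is already outside its active period.
func (r recoveryLog) mayRecover(cluster string, now time.Time) bool {
	last, ok := r[cluster]
	return !ok || now.Sub(last) >= recoveryBlockPeriod
}

func main() {
	recent := recoveryLog{"commerce": time.Now().Add(-5 * time.Minute)}
	fmt.Println(recent.mayRecover("commerce", time.Now())) // false: still in active period, would be registered as blocked
	fmt.Println(recent.mayRecover("messages", time.Now())) // true
}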
+func RegisterBlockedRecoveries(analysisEntry *inst.ReplicationAnalysis, blockingRecoveries []TopologyRecovery) error { + for _, recovery := range blockingRecoveries { + _, err := db.ExecOrchestrator(` + insert + into blocked_topology_recovery ( + hostname, + port, + cluster_name, + analysis, + last_blocked_timestamp, + blocking_recovery_id + ) values ( + ?, + ?, + ?, + ?, + NOW(), + ? + ) + on duplicate key update + cluster_name=values(cluster_name), + analysis=values(analysis), + last_blocked_timestamp=values(last_blocked_timestamp), + blocking_recovery_id=values(blocking_recovery_id) + `, analysisEntry.AnalyzedInstanceKey.Hostname, + analysisEntry.AnalyzedInstanceKey.Port, + analysisEntry.ClusterDetails.ClusterName, + string(analysisEntry.Analysis), + recovery.Id, + ) + if err != nil { + log.Errore(err) + } + } + return nil +} + +// ExpireBlockedRecoveries clears listing of blocked recoveries that are no longer actually blocked. +func ExpireBlockedRecoveries() error { + // Older recovery is acknowledged by now, hence blocked recovery should be released. + // Do NOTE that the data in blocked_topology_recovery is only used for auditing: it is NOT the data + // based on which we make automated decisions. + + query := ` + select + blocked_topology_recovery.hostname, + blocked_topology_recovery.port + from + blocked_topology_recovery + left join topology_recovery on (blocking_recovery_id = topology_recovery.recovery_id and acknowledged = 0) + where + acknowledged is null + ` + expiredKeys := inst.NewInstanceKeyMap() + err := db.QueryOrchestrator(query, sqlutils.Args(), func(m sqlutils.RowMap) error { + key := inst.InstanceKey{Hostname: m.GetString("hostname"), Port: m.GetInt("port")} + expiredKeys.AddKey(key) + return nil + }) + + for _, expiredKey := range expiredKeys.GetInstanceKeys() { + _, err := db.ExecOrchestrator(` + delete + from blocked_topology_recovery + where + hostname = ? + and port = ? + `, + expiredKey.Hostname, expiredKey.Port, + ) + if err != nil { + return log.Errore(err) + } + } + + if err != nil { + return log.Errore(err) + } + // Some oversampling, if a problem has not been noticed for some time (e.g. the server came up alive + // before action was taken), expire it. + // Recall that RegisterBlockedRecoveries continuously updates the last_blocked_timestamp column. + _, err = db.ExecOrchestrator(` + delete + from blocked_topology_recovery + where + last_blocked_timestamp < NOW() - interval ? second + `, (config.RecoveryPollSeconds * 2), + ) + return log.Errore(err) +} + +// acknowledgeRecoveries sets acknowledged* details and clears the in_active_period flags from a set of entries +func acknowledgeRecoveries(owner string, comment string, markEndRecovery bool, whereClause string, args []interface{}) (countAcknowledgedEntries int64, err error) { + additionalSet := `` + if markEndRecovery { + additionalSet = ` + end_recovery=IFNULL(end_recovery, NOW()), + ` + } + query := fmt.Sprintf(` + update topology_recovery set + in_active_period = 0, + end_active_period_unixtime = case when end_active_period_unixtime = 0 then UNIX_TIMESTAMP() else end_active_period_unixtime end, + %s + acknowledged = 1, + acknowledged_at = NOW(), + acknowledged_by = ?, + acknowledge_comment = ? + where + acknowledged = 0 + and + %s + `, additionalSet, whereClause) + args = append(sqlutils.Args(owner, comment), args...) + sqlResult, err := db.ExecOrchestrator(query, args...) 
+	if err != nil {
+		return 0, log.Errore(err)
+	}
+	rows, err := sqlResult.RowsAffected()
+	return rows, log.Errore(err)
+}
+
+// AcknowledgeAllRecoveries acknowledges all unacknowledged recoveries.
+func AcknowledgeAllRecoveries(owner string, comment string) (countAcknowledgedEntries int64, err error) {
+	whereClause := `1 = 1`
+	return acknowledgeRecoveries(owner, comment, false, whereClause, sqlutils.Args())
+}
+
+// AcknowledgeRecovery acknowledges a particular recovery by recovery_id.
+// This also implies clearing its active period, which in turn enables further recoveries on that topology.
+func AcknowledgeRecovery(recoveryId int64, owner string, comment string) (countAcknowledgedEntries int64, err error) {
+	whereClause := `recovery_id = ?`
+	return acknowledgeRecoveries(owner, comment, false, whereClause, sqlutils.Args(recoveryId))
+}
+
+// AcknowledgeRecoveryByUID acknowledges a particular recovery by UID.
+// This also implies clearing its active period, which in turn enables further recoveries on that topology.
+func AcknowledgeRecoveryByUID(recoveryUID string, owner string, comment string) (countAcknowledgedEntries int64, err error) {
+	whereClause := `uid = ?`
+	return acknowledgeRecoveries(owner, comment, false, whereClause, sqlutils.Args(recoveryUID))
+}
+
+// AcknowledgeClusterRecoveries marks active recoveries for a given cluster as acknowledged.
+// This also implies clearing their active period, which in turn enables further recoveries on that topology.
+func AcknowledgeClusterRecoveries(clusterName string, owner string, comment string) (countAcknowledgedEntries int64, err error) {
+	{
+		whereClause := `cluster_name = ?`
+		args := sqlutils.Args(clusterName)
+		clearAcknowledgedFailureDetections(whereClause, args)
+		count, err := acknowledgeRecoveries(owner, comment, false, whereClause, args)
+		if err != nil {
+			return count, err
+		}
+		countAcknowledgedEntries = countAcknowledgedEntries + count
+	}
+	{
+		clusterInfo, err := inst.ReadClusterInfo(clusterName)
+		if err != nil {
+			return countAcknowledgedEntries, err
+		}
+		whereClause := `cluster_alias = ? and cluster_alias != ''`
+		args := sqlutils.Args(clusterInfo.ClusterAlias)
+		clearAcknowledgedFailureDetections(whereClause, args)
+		count, err := acknowledgeRecoveries(owner, comment, false, whereClause, args)
+		if err != nil {
+			return count, err
+		}
+		countAcknowledgedEntries = countAcknowledgedEntries + count
+	}
+	return countAcknowledgedEntries, nil
+}
+
+// AcknowledgeInstanceRecoveries marks active recoveries for a given instance as acknowledged.
+// This also implies clearing their active period, which in turn enables further recoveries on that topology.
+func AcknowledgeInstanceRecoveries(instanceKey *inst.InstanceKey, owner string, comment string) (countAcknowledgedEntries int64, err error) {
+	whereClause := `
+		hostname = ?
+		and port = ?
+	`
+	args := sqlutils.Args(instanceKey.Hostname, instanceKey.Port)
+	clearAcknowledgedFailureDetections(whereClause, args)
+	return acknowledgeRecoveries(owner, comment, false, whereClause, args)
+}
+
+// AcknowledgeInstanceCompletedRecoveries marks active and COMPLETED recoveries for a given instance as acknowledged.
+// This also implies clearing their active period, which in turn enables further recoveries on that topology.
+func AcknowledgeInstanceCompletedRecoveries(instanceKey *inst.InstanceKey, owner string, comment string) (countAcknowledgedEntries int64, err error) {
+	whereClause := `
+		hostname = ?
+		and port = ?
+ and end_recovery is not null + ` + return acknowledgeRecoveries(owner, comment, false, whereClause, sqlutils.Args(instanceKey.Hostname, instanceKey.Port)) +} + +// AcknowledgeCrashedRecoveries marks recoveries whose processing nodes has crashed as acknowledged. +func AcknowledgeCrashedRecoveries() (countAcknowledgedEntries int64, err error) { + whereClause := ` + in_active_period = 1 + and end_recovery is null + and concat(processing_node_hostname, ':', processcing_node_token) not in ( + select concat(hostname, ':', token) from node_health + ) + ` + return acknowledgeRecoveries("orchestrator", "detected crashed recovery", true, whereClause, sqlutils.Args()) +} + +// ResolveRecovery is called on completion of a recovery process and updates the recovery status. +// It does not clear the "active period" as this still takes place in order to avoid flapping. +func writeResolveRecovery(topologyRecovery *TopologyRecovery) error { + var successorKeyToWrite inst.InstanceKey + if topologyRecovery.IsSuccessful { + successorKeyToWrite = *topologyRecovery.SuccessorKey + } + _, err := db.ExecOrchestrator(` + update topology_recovery set + is_successful = ?, + successor_hostname = ?, + successor_port = ?, + successor_alias = ?, + lost_slaves = ?, + participating_instances = ?, + all_errors = ?, + end_recovery = NOW() + where + uid = ? + `, topologyRecovery.IsSuccessful, successorKeyToWrite.Hostname, successorKeyToWrite.Port, + topologyRecovery.SuccessorAlias, topologyRecovery.LostReplicas.ToCommaDelimitedList(), + topologyRecovery.ParticipatingInstanceKeys.ToCommaDelimitedList(), + strings.Join(topologyRecovery.AllErrors, "\n"), + topologyRecovery.UID, + ) + return log.Errore(err) +} + +// readRecoveries reads recovery entry/audit entries from topology_recovery +func readRecoveries(whereCondition string, limit string, args []interface{}) ([]TopologyRecovery, error) { + res := []TopologyRecovery{} + query := fmt.Sprintf(` + select + recovery_id, + uid, + hostname, + port, + (IFNULL(end_active_period_unixtime, 0) = 0) as is_active, + start_active_period, + IFNULL(end_active_period_unixtime, 0) as end_active_period_unixtime, + IFNULL(end_recovery, '') AS end_recovery, + is_successful, + processing_node_hostname, + processcing_node_token, + ifnull(successor_hostname, '') as successor_hostname, + ifnull(successor_port, 0) as successor_port, + ifnull(successor_alias, '') as successor_alias, + analysis, + cluster_name, + cluster_alias, + count_affected_slaves, + slave_hosts, + participating_instances, + lost_slaves, + all_errors, + acknowledged, + acknowledged_at, + acknowledged_by, + acknowledge_comment, + last_detection_id + from + topology_recovery + %s + order by + recovery_id desc + %s + `, whereCondition, limit) + err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + topologyRecovery := *NewTopologyRecovery(inst.ReplicationAnalysis{}) + topologyRecovery.Id = m.GetInt64("recovery_id") + topologyRecovery.UID = m.GetString("uid") + + topologyRecovery.IsActive = m.GetBool("is_active") + topologyRecovery.RecoveryStartTimestamp = m.GetString("start_active_period") + topologyRecovery.RecoveryEndTimestamp = m.GetString("end_recovery") + topologyRecovery.IsSuccessful = m.GetBool("is_successful") + topologyRecovery.ProcessingNodeHostname = m.GetString("processing_node_hostname") + topologyRecovery.ProcessingNodeToken = m.GetString("processcing_node_token") + + topologyRecovery.AnalysisEntry.AnalyzedInstanceKey.Hostname = m.GetString("hostname") + 
topologyRecovery.AnalysisEntry.AnalyzedInstanceKey.Port = m.GetInt("port") + topologyRecovery.AnalysisEntry.Analysis = inst.AnalysisCode(m.GetString("analysis")) + topologyRecovery.AnalysisEntry.ClusterDetails.ClusterName = m.GetString("cluster_name") + topologyRecovery.AnalysisEntry.ClusterDetails.ClusterAlias = m.GetString("cluster_alias") + topologyRecovery.AnalysisEntry.CountReplicas = m.GetUint("count_affected_slaves") + topologyRecovery.AnalysisEntry.ReadReplicaHostsFromString(m.GetString("slave_hosts")) + + topologyRecovery.SuccessorKey = &inst.InstanceKey{} + topologyRecovery.SuccessorKey.Hostname = m.GetString("successor_hostname") + topologyRecovery.SuccessorKey.Port = m.GetInt("successor_port") + topologyRecovery.SuccessorAlias = m.GetString("successor_alias") + + topologyRecovery.AnalysisEntry.ClusterDetails.ReadRecoveryInfo() + + topologyRecovery.AllErrors = strings.Split(m.GetString("all_errors"), "\n") + topologyRecovery.LostReplicas.ReadCommaDelimitedList(m.GetString("lost_slaves")) + topologyRecovery.ParticipatingInstanceKeys.ReadCommaDelimitedList(m.GetString("participating_instances")) + + topologyRecovery.Acknowledged = m.GetBool("acknowledged") + topologyRecovery.AcknowledgedAt = m.GetString("acknowledged_at") + topologyRecovery.AcknowledgedBy = m.GetString("acknowledged_by") + topologyRecovery.AcknowledgedComment = m.GetString("acknowledge_comment") + + topologyRecovery.LastDetectionId = m.GetInt64("last_detection_id") + + res = append(res, topologyRecovery) + return nil + }) + + return res, log.Errore(err) +} + +// ReadActiveRecoveries reads active recovery entry/audit entries from topology_recovery +func ReadActiveClusterRecovery(clusterName string) ([]TopologyRecovery, error) { + whereClause := ` + where + in_active_period=1 + and end_recovery is null + and cluster_name=?` + return readRecoveries(whereClause, ``, sqlutils.Args(clusterName)) +} + +// ReadInActivePeriodClusterRecovery reads recoveries (possibly complete!) that are in active period. +// (may be used to block further recoveries on this cluster) +func ReadInActivePeriodClusterRecovery(clusterName string) ([]TopologyRecovery, error) { + whereClause := ` + where + in_active_period=1 + and cluster_name=?` + return readRecoveries(whereClause, ``, sqlutils.Args(clusterName)) +} + +// ReadRecentlyActiveClusterRecovery reads recently completed entries for a given cluster +func ReadRecentlyActiveClusterRecovery(clusterName string) ([]TopologyRecovery, error) { + whereClause := ` + where + end_recovery > now() - interval 5 minute + and cluster_name=?` + return readRecoveries(whereClause, ``, sqlutils.Args(clusterName)) +} + +// ReadInActivePeriodSuccessorInstanceRecovery reads completed recoveries for a given instance, where said instance +// was promoted as result, still in active period (may be used to block further recoveries should this instance die) +func ReadInActivePeriodSuccessorInstanceRecovery(instanceKey *inst.InstanceKey) ([]TopologyRecovery, error) { + whereClause := ` + where + in_active_period=1 + and + successor_hostname=? and successor_port=?` + return readRecoveries(whereClause, ``, sqlutils.Args(instanceKey.Hostname, instanceKey.Port)) +} + +// ReadRecentlyActiveInstanceRecovery reads recently completed entries for a given instance +func ReadRecentlyActiveInstanceRecovery(instanceKey *inst.InstanceKey) ([]TopologyRecovery, error) { + whereClause := ` + where + end_recovery > now() - interval 5 minute + and + successor_hostname=? 
+		and successor_port=?`
+	return readRecoveries(whereClause, ``, sqlutils.Args(instanceKey.Hostname, instanceKey.Port))
+}
+
+// ReadActiveRecoveries reads active recovery entry/audit entries from topology_recovery
+func ReadActiveRecoveries() ([]TopologyRecovery, error) {
+	return readRecoveries(`
+		where
+			in_active_period=1
+			and end_recovery is null`,
+		``, sqlutils.Args())
+}
+
+// ReadCompletedRecoveries reads completed recovery entry/audit entries from topology_recovery
+func ReadCompletedRecoveries(page int) ([]TopologyRecovery, error) {
+	limit := `
+		limit ?
+		offset ?`
+	return readRecoveries(`where end_recovery is not null`, limit, sqlutils.Args(config.AuditPageSize, page*config.AuditPageSize))
+}
+
+// ReadRecovery reads a single recovery entry/audit entry from topology_recovery by recovery_id
+func ReadRecovery(recoveryId int64) ([]TopologyRecovery, error) {
+	whereClause := `where recovery_id = ?`
+	return readRecoveries(whereClause, ``, sqlutils.Args(recoveryId))
+}
+
+// ReadRecoveryByUID reads a single recovery entry/audit entry from topology_recovery by UID
+func ReadRecoveryByUID(recoveryUID string) ([]TopologyRecovery, error) {
+	whereClause := `where uid = ?`
+	return readRecoveries(whereClause, ``, sqlutils.Args(recoveryUID))
+}
+
+// ReadRecentRecoveries reads the latest recovery entries from topology_recovery
+func ReadRecentRecoveries(clusterName string, clusterAlias string, unacknowledgedOnly bool, page int) ([]TopologyRecovery, error) {
+	whereConditions := []string{}
+	whereClause := ""
+	args := sqlutils.Args()
+	if unacknowledgedOnly {
+		whereConditions = append(whereConditions, `acknowledged=0`)
+	}
+	if clusterName != "" {
+		whereConditions = append(whereConditions, `cluster_name=?`)
+		args = append(args, clusterName)
+	} else if clusterAlias != "" {
+		whereConditions = append(whereConditions, `cluster_alias=?`)
+		args = append(args, clusterAlias)
+	}
+	if len(whereConditions) > 0 {
+		whereClause = fmt.Sprintf("where %s", strings.Join(whereConditions, " and "))
+	}
+	limit := `
+		limit ?
+ offset ?` + args = append(args, config.AuditPageSize, page*config.AuditPageSize) + return readRecoveries(whereClause, limit, args) +} + +// readRecoveries reads recovery entry/audit entries from topology_recovery +func readFailureDetections(whereCondition string, limit string, args []interface{}) ([]TopologyRecovery, error) { + res := []TopologyRecovery{} + query := fmt.Sprintf(` + select + detection_id, + hostname, + port, + in_active_period as is_active, + start_active_period, + end_active_period_unixtime, + processing_node_hostname, + processcing_node_token, + analysis, + cluster_name, + cluster_alias, + count_affected_slaves, + slave_hosts, + (select max(recovery_id) from topology_recovery where topology_recovery.last_detection_id = detection_id) as related_recovery_id + from + topology_failure_detection + %s + order by + detection_id desc + %s + `, whereCondition, limit) + err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + failureDetection := TopologyRecovery{} + failureDetection.Id = m.GetInt64("detection_id") + + failureDetection.IsActive = m.GetBool("is_active") + failureDetection.RecoveryStartTimestamp = m.GetString("start_active_period") + failureDetection.ProcessingNodeHostname = m.GetString("processing_node_hostname") + failureDetection.ProcessingNodeToken = m.GetString("processcing_node_token") + + failureDetection.AnalysisEntry.AnalyzedInstanceKey.Hostname = m.GetString("hostname") + failureDetection.AnalysisEntry.AnalyzedInstanceKey.Port = m.GetInt("port") + failureDetection.AnalysisEntry.Analysis = inst.AnalysisCode(m.GetString("analysis")) + failureDetection.AnalysisEntry.ClusterDetails.ClusterName = m.GetString("cluster_name") + failureDetection.AnalysisEntry.ClusterDetails.ClusterAlias = m.GetString("cluster_alias") + failureDetection.AnalysisEntry.CountReplicas = m.GetUint("count_affected_slaves") + failureDetection.AnalysisEntry.ReadReplicaHostsFromString(m.GetString("slave_hosts")) + failureDetection.AnalysisEntry.StartActivePeriod = m.GetString("start_active_period") + + failureDetection.RelatedRecoveryId = m.GetInt64("related_recovery_id") + + failureDetection.AnalysisEntry.ClusterDetails.ReadRecoveryInfo() + + res = append(res, failureDetection) + return nil + }) + + return res, log.Errore(err) +} + +// ReadRecentFailureDetections +func ReadRecentFailureDetections(clusterAlias string, page int) ([]TopologyRecovery, error) { + whereClause := "" + args := sqlutils.Args() + if clusterAlias != "" { + whereClause = `where cluster_alias = ?` + args = append(args, clusterAlias) + } + limit := ` + limit ? 
+ offset ?` + args = append(args, config.AuditPageSize, page*config.AuditPageSize) + return readFailureDetections(whereClause, limit, args) +} + +// ReadFailureDetection +func ReadFailureDetection(detectionId int64) ([]TopologyRecovery, error) { + whereClause := `where detection_id = ?` + return readFailureDetections(whereClause, ``, sqlutils.Args(detectionId)) +} + +// ReadBlockedRecoveries reads blocked recovery entries, potentially filtered by cluster name (empty to unfilter) +func ReadBlockedRecoveries(clusterName string) ([]BlockedTopologyRecovery, error) { + res := []BlockedTopologyRecovery{} + whereClause := "" + args := sqlutils.Args() + if clusterName != "" { + whereClause = `where cluster_name = ?` + args = append(args, clusterName) + } + query := fmt.Sprintf(` + select + hostname, + port, + cluster_name, + analysis, + last_blocked_timestamp, + blocking_recovery_id + from + blocked_topology_recovery + %s + order by + last_blocked_timestamp desc + `, whereClause) + err := db.QueryOrchestrator(query, args, func(m sqlutils.RowMap) error { + blockedTopologyRecovery := BlockedTopologyRecovery{} + blockedTopologyRecovery.FailedInstanceKey.Hostname = m.GetString("hostname") + blockedTopologyRecovery.FailedInstanceKey.Port = m.GetInt("port") + blockedTopologyRecovery.ClusterName = m.GetString("cluster_name") + blockedTopologyRecovery.Analysis = inst.AnalysisCode(m.GetString("analysis")) + blockedTopologyRecovery.LastBlockedTimestamp = m.GetString("last_blocked_timestamp") + blockedTopologyRecovery.BlockingRecoveryId = m.GetInt64("blocking_recovery_id") + + res = append(res, blockedTopologyRecovery) + return nil + }) + + return res, log.Errore(err) +} + +// writeTopologyRecoveryStep writes down a single step in a recovery process +func writeTopologyRecoveryStep(topologyRecoveryStep *TopologyRecoveryStep) error { + sqlResult, err := db.ExecOrchestrator(` + insert ignore + into topology_recovery_steps ( + recovery_step_id, recovery_uid, audit_at, message + ) values (?, ?, now(), ?) + `, sqlutils.NilIfZero(topologyRecoveryStep.Id), topologyRecoveryStep.RecoveryUID, topologyRecoveryStep.Message, + ) + if err != nil { + return log.Errore(err) + } + topologyRecoveryStep.Id, err = sqlResult.LastInsertId() + return log.Errore(err) +} + +// ReadTopologyRecoverySteps reads recovery steps for a given recovery +func ReadTopologyRecoverySteps(recoveryUID string) ([]TopologyRecoveryStep, error) { + res := []TopologyRecoveryStep{} + query := ` + select + recovery_step_id, recovery_uid, audit_at, message + from + topology_recovery_steps + where + recovery_uid=? 
+ order by + recovery_step_id asc + ` + err := db.QueryOrchestrator(query, sqlutils.Args(recoveryUID), func(m sqlutils.RowMap) error { + recoveryStep := TopologyRecoveryStep{} + recoveryStep.RecoveryUID = recoveryUID + recoveryStep.Id = m.GetInt64("recovery_step_id") + recoveryStep.AuditAt = m.GetString("audit_at") + recoveryStep.Message = m.GetString("message") + + res = append(res, recoveryStep) + return nil + }) + return res, log.Errore(err) +} + +// ExpireFailureDetectionHistory removes old rows from the topology_failure_detection table +func ExpireFailureDetectionHistory() error { + return inst.ExpireTableData("topology_failure_detection", "start_active_period") +} + +// ExpireTopologyRecoveryHistory removes old rows from the topology_failure_detection table +func ExpireTopologyRecoveryHistory() error { + return inst.ExpireTableData("topology_recovery", "start_active_period") +} + +// ExpireTopologyRecoveryStepsHistory removes old rows from the topology_failure_detection table +func ExpireTopologyRecoveryStepsHistory() error { + return inst.ExpireTableData("topology_recovery_steps", "audit_at") +} diff --git a/go/vt/orchestrator/metrics/graphite.go b/go/vt/orchestrator/metrics/graphite.go new file mode 100644 index 0000000000..195b7e5d81 --- /dev/null +++ b/go/vt/orchestrator/metrics/graphite.go @@ -0,0 +1,58 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package metrics + +import ( + "net" + "strings" + "time" + + graphite "github.com/cyberdelia/go-metrics-graphite" + "github.com/rcrowley/go-metrics" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/process" +) + +// InitGraphiteMetrics is called once in the lifetime of the app, after config has been loaded +func InitGraphiteMetrics() error { + if config.Config.GraphiteAddr == "" { + return nil + } + if config.Config.GraphitePollSeconds <= 0 { + return nil + } + if config.Config.GraphitePath == "" { + return log.Errorf("No graphite path provided (see GraphitePath config variable). Will not log to graphite") + } + addr, err := net.ResolveTCPAddr("tcp", config.Config.GraphiteAddr) + if err != nil { + return log.Errore(err) + } + graphitePathHostname := process.ThisHostname + if config.Config.GraphiteConvertHostnameDotsToUnderscores { + graphitePathHostname = strings.Replace(graphitePathHostname, ".", "_", -1) + } + graphitePath := config.Config.GraphitePath + graphitePath = strings.Replace(graphitePath, "{hostname}", graphitePathHostname, -1) + + log.Debugf("Will log to graphite on %+v, %+v", config.Config.GraphiteAddr, graphitePath) + + go graphite.Graphite(metrics.DefaultRegistry, 1*time.Minute, graphitePath, addr) + + return nil +} diff --git a/go/vt/orchestrator/metrics/metrics.go b/go/vt/orchestrator/metrics/metrics.go new file mode 100644 index 0000000000..468a08c346 --- /dev/null +++ b/go/vt/orchestrator/metrics/metrics.go @@ -0,0 +1,43 @@ +/* + Copyright 2014 Outbrain Inc. 
+
+	Licensed under the Apache License, Version 2.0 (the "License");
+	you may not use this file except in compliance with the License.
+	You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+*/
+
+package metrics
+
+import (
+	"time"
+
+	"vitess.io/vitess/go/vt/orchestrator/config"
+)
+
+var metricTickCallbacks []func()
+
+// InitMetrics is called once in the lifetime of the app, after config has been loaded
+func InitMetrics() error {
+	go func() {
+		metricsCallbackTick := time.Tick(time.Duration(config.DebugMetricsIntervalSeconds) * time.Second)
+		for range metricsCallbackTick {
+			for _, f := range metricTickCallbacks {
+				go f()
+			}
+		}
+	}()
+
+	return nil
+}
+
+func OnMetricsTick(f func()) {
+	metricTickCallbacks = append(metricTickCallbacks, f)
+}
diff --git a/go/vt/orchestrator/metrics/query/aggregated.go b/go/vt/orchestrator/metrics/query/aggregated.go
new file mode 100644
index 0000000000..f6aa0418e6
--- /dev/null
+++ b/go/vt/orchestrator/metrics/query/aggregated.go
@@ -0,0 +1,76 @@
+// Package query provides query metrics, with this file providing
+// aggregated metrics based on the underlying values.
+package query
+
+import (
+	"time"
+
+	"github.com/montanaflynn/stats"
+
+	"vitess.io/vitess/go/vt/orchestrator/collection"
+)
+
+// AggregatedQueryMetrics holds summary statistics over a period of query metrics.
+type AggregatedQueryMetrics struct {
+	Count                int
+	MaxLatencySeconds    float64
+	MeanLatencySeconds   float64
+	MedianLatencySeconds float64
+	P95LatencySeconds    float64
+	MaxWaitSeconds       float64
+	MeanWaitSeconds      float64
+	MedianWaitSeconds    float64
+	P95WaitSeconds       float64
+}
+
+// AggregatedSince returns the aggregated query metrics for the period
+// given from the values provided.
+func AggregatedSince(c *collection.Collection, t time.Time) AggregatedQueryMetrics { + + // Initialise timing metrics + var waitTimings []float64 + var queryTimings []float64 + + // Retrieve values since the time specified + values, err := c.Since(t) + a := AggregatedQueryMetrics{} + if err != nil { + return a // empty data + } + + // generate the metrics + for _, v := range values { + waitTimings = append(waitTimings, v.(*Metric).WaitLatency.Seconds()) + queryTimings = append(queryTimings, v.(*Metric).ExecuteLatency.Seconds()) + } + + a.Count = len(waitTimings) + + // generate aggregate values + if s, err := stats.Max(stats.Float64Data(waitTimings)); err == nil { + a.MaxWaitSeconds = s + } + if s, err := stats.Mean(stats.Float64Data(waitTimings)); err == nil { + a.MeanWaitSeconds = s + } + if s, err := stats.Median(stats.Float64Data(waitTimings)); err == nil { + a.MedianWaitSeconds = s + } + if s, err := stats.Percentile(stats.Float64Data(waitTimings), 95); err == nil { + a.P95WaitSeconds = s + } + if s, err := stats.Max(stats.Float64Data(queryTimings)); err == nil { + a.MaxLatencySeconds = s + } + if s, err := stats.Mean(stats.Float64Data(queryTimings)); err == nil { + a.MeanLatencySeconds = s + } + if s, err := stats.Median(stats.Float64Data(queryTimings)); err == nil { + a.MedianLatencySeconds = s + } + if s, err := stats.Percentile(stats.Float64Data(queryTimings), 95); err == nil { + a.P95LatencySeconds = s + } + + return a +} diff --git a/go/vt/orchestrator/metrics/query/metric.go b/go/vt/orchestrator/metrics/query/metric.go new file mode 100644 index 0000000000..a8a647f31b --- /dev/null +++ b/go/vt/orchestrator/metrics/query/metric.go @@ -0,0 +1,51 @@ +/* + Copyright 2017 Simon J Mudd + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package query + +/* + query holds information about query metrics and records the time taken + waiting before doing the query plus the time taken executing the query. +*/ +import ( + "time" +) + +// Metric records query metrics of backend writes that go through +// a sized channel. It allows us to compare the time waiting to +// execute the query against the time needed to run it and in a +// "sized channel" the wait time may be significant and is good to +// measure. 
+type Metric struct { + Timestamp time.Time // time the metric was started + WaitLatency time.Duration // time that we had to wait before starting query execution + ExecuteLatency time.Duration // time the query took to execute + Err error // any error resulting from the query execution +} + +// NewMetric returns a new metric with timestamp starting from now +func NewMetric() *Metric { + bqm := &Metric{ + Timestamp: time.Now(), + } + + return bqm +} + +// When records the timestamp of the start of the recording +func (m Metric) When() time.Time { + return m.Timestamp +} diff --git a/go/vt/orchestrator/os/process.go b/go/vt/orchestrator/os/process.go new file mode 100644 index 0000000000..27afaf15cb --- /dev/null +++ b/go/vt/orchestrator/os/process.go @@ -0,0 +1,90 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package os + +import ( + "fmt" + "io/ioutil" + "os" + "os/exec" + "strings" + "syscall" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +var EmptyEnv = []string{} + +// CommandRun executes some text as a command. This is assumed to be +// text that will be run by a shell so we need to write out the +// command to a temporary file and then ask the shell to execute +// it, after which the temporary file is removed. +func CommandRun(commandText string, env []string, arguments ...string) error { + // show the actual command we have been asked to run + log.Infof("CommandRun(%v,%+v)", commandText, arguments) + + cmd, shellScript, err := generateShellScript(commandText, env, arguments...) + defer os.Remove(shellScript) + if err != nil { + return log.Errore(err) + } + + var waitStatus syscall.WaitStatus + + log.Infof("CommandRun/running: %s", strings.Join(cmd.Args, " ")) + cmdOutput, err := cmd.CombinedOutput() + log.Infof("CommandRun: %s\n", string(cmdOutput)) + if err != nil { + // Did the command fail because of an unsuccessful exit code + if exitError, ok := err.(*exec.ExitError); ok { + waitStatus = exitError.Sys().(syscall.WaitStatus) + log.Errorf("CommandRun: failed. exit status %d", waitStatus.ExitStatus()) + } + + return log.Errore(fmt.Errorf("(%s) %s", err.Error(), cmdOutput)) + } + + // Command was successful + waitStatus = cmd.ProcessState.Sys().(syscall.WaitStatus) + log.Infof("CommandRun successful. exit status %d", waitStatus.ExitStatus()) + + return nil +} + +// generateShellScript generates a temporary shell script based on +// the given command to be executed, writes the command to a temporary +// file and returns the exec.Command which can be executed together +// with the script name that was created. 
+func generateShellScript(commandText string, env []string, arguments ...string) (*exec.Cmd, string, error) { + shell := config.Config.ProcessesShellCommand + + commandBytes := []byte(commandText) + tmpFile, err := ioutil.TempFile("", "orchestrator-process-cmd-") + if err != nil { + return nil, "", log.Errorf("generateShellScript() failed to create TempFile: %v", err.Error()) + } + // write commandText to temporary file + ioutil.WriteFile(tmpFile.Name(), commandBytes, 0640) + shellArguments := append([]string{}, tmpFile.Name()) + shellArguments = append(shellArguments, arguments...) + + cmd := exec.Command(shell, shellArguments...) + cmd.Env = env + + return cmd, tmpFile.Name(), nil +} diff --git a/go/vt/orchestrator/os/process_test.go b/go/vt/orchestrator/os/process_test.go new file mode 100644 index 0000000000..c2b450a288 --- /dev/null +++ b/go/vt/orchestrator/os/process_test.go @@ -0,0 +1,18 @@ +package os + +import ( + "fmt" + "testing" +) + +func TestCommandRun(t *testing.T) { + cmdErr := CommandRun("echo \"VAR1=$VAR1 VAR2=$VAR2\" && exit 11", []string{"VAR1=a", "VAR2=b"}) + if cmdErr == nil { + t.Error("Expected CommandRun to fail, but no error returned") + } + + expectedMsg := "(exit status 11) VAR1=a VAR2=b\n" + if cmdErr.Error() != expectedMsg { + t.Errorf(fmt.Sprintf("Expected CommandRun to return an Error '%s' but got '%s'", expectedMsg, cmdErr.Error())) + } +} diff --git a/go/vt/orchestrator/os/unixcheck.go b/go/vt/orchestrator/os/unixcheck.go new file mode 100644 index 0000000000..2033e5bd78 --- /dev/null +++ b/go/vt/orchestrator/os/unixcheck.go @@ -0,0 +1,68 @@ +/* + Copyright 2017 Simon Mudd, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package os + +import ( + "os/user" + "strings" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +// UserInGroups checks if the given username is in the given unix +// groups. It might be worth caching this for performance reasons. +func UserInGroups(authUser string, powerAuthGroups []string) bool { + // these conditions are treated as false + if authUser == "" || len(powerAuthGroups) == 0 { + return false + } + + // make a map (likely to have only one group maybe) of power groups + powerGroupMap := make(map[string]bool) + for _, v := range powerAuthGroups { + powerGroupMap[v] = true + } + + currentUser, err := user.Lookup(authUser) + if err != nil { + // The user not being known is not an error so don't report this. 
+ // ERROR Failed to lookup user "simon": user: unknown user simon + if !strings.Contains(err.Error(), "unknown user") { + log.Errorf("Failed to lookup user %q: %v", authUser, err) + } + return false + } + gids, err := currentUser.GroupIds() + if err != nil { + log.Errorf("Failed to lookup groupids for user %q: %v", authUser, err) + return false + } + // get the group name from the id and check if the name is in powerGroupMap + for _, gid := range gids { + group, err := user.LookupGroupId(gid) + if err != nil { + log.Errorf("Failed to lookup group id for gid %s: %v", gid, err) // yes gids are strings! + return false + } + + if _, found := powerGroupMap[group.Name]; found { + return true + } + } + + return false +} diff --git a/go/vt/orchestrator/os/unixcheck_test.go b/go/vt/orchestrator/os/unixcheck_test.go new file mode 100644 index 0000000000..0121f3c503 --- /dev/null +++ b/go/vt/orchestrator/os/unixcheck_test.go @@ -0,0 +1,49 @@ +/* + Copyright 2017 Simon Mudd, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ +package os + +import ( + "testing" +) + +type testCase struct { + user string + powerUsers []string + expected bool +} + +var testCases []testCase + +func init() { + // It is hard to come up with good results that will work on all systems + // so the tests are limited but should work on most Linux or OSX systems. + // If you find a case where the tests fail due to user differences please + // adjust the test cases appropriately. + testCases = []testCase{ + {"root", []string{"root", "wheel"}, true}, + {"root", []string{"not_in_this_group"}, false}, + {"not_found_user", []string{"not_in_this_group"}, false}, + } +} + +// test the users etc +func TestUsers(t *testing.T) { + for _, v := range testCases { + if got := UserInGroups(v.user, v.powerUsers); got != v.expected { + t.Errorf("userInGroups(%q,%+v) failed. Got %v, Expected %v", v.user, v.powerUsers, got, v.expected) + } + } +} diff --git a/go/vt/orchestrator/process/access_token_dao.go b/go/vt/orchestrator/process/access_token_dao.go new file mode 100644 index 0000000000..7e09fdc070 --- /dev/null +++ b/go/vt/orchestrator/process/access_token_dao.go @@ -0,0 +1,124 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package process + +import ( + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + "vitess.io/vitess/go/vt/orchestrator/util" +) + +// GenerateAccessToken attempts to generate a new access token and returns the public +// part of the token +func GenerateAccessToken(owner string) (publicToken string, err error) { + publicToken = util.NewToken().Hash + secretToken := util.NewToken().Hash + + _, err = db.ExecOrchestrator(` + insert into access_token ( + public_token, secret_token, generated_at, generated_by, is_acquired, is_reentrant + ) values ( + ?, ?, now(), ?, 0, 0 + ) + `, + publicToken, secretToken, owner, + ) + if err != nil { + return publicToken, log.Errore(err) + } + return publicToken, nil +} + +// AcquireAccessToken attempts to acquire a hopefully free token; returning in such case +// the secretToken as proof of ownership. +func AcquireAccessToken(publicToken string) (secretToken string, err error) { + secretToken = "" + sqlResult, err := db.ExecOrchestrator(` + update access_token + set + is_acquired=1, + acquired_at=now() + where + public_token=? + and ( + ( + is_acquired=0 + and generated_at > now() - interval ? second + ) + or is_reentrant=1 + ) + `, + publicToken, config.Config.AccessTokenUseExpirySeconds, + ) + if err != nil { + return secretToken, log.Errore(err) + } + rows, err := sqlResult.RowsAffected() + if err != nil { + return secretToken, log.Errore(err) + } + if rows == 0 { + return secretToken, log.Errorf("Cannot acquire token %s", publicToken) + } + // Seems like we made it! + query := ` + select secret_token from access_token where public_token=? + ` + err = db.QueryOrchestrator(query, sqlutils.Args(publicToken), func(m sqlutils.RowMap) error { + secretToken = m.GetString("secret_token") + return nil + }) + return secretToken, log.Errore(err) +} + +// TokenIsValid checks to see whether a given token exists and is not outdated. +func TokenIsValid(publicToken string, secretToken string) (result bool, err error) { + query := ` + select + count(*) as valid_token + from + access_token + where + public_token=? + and secret_token=? + and ( + generated_at >= now() - interval ? minute + or is_reentrant = 1 + ) + ` + err = db.QueryOrchestrator(query, sqlutils.Args(publicToken, secretToken, config.Config.AccessTokenExpiryMinutes), func(m sqlutils.RowMap) error { + result = m.GetInt("valid_token") > 0 + return nil + }) + return result, log.Errore(err) +} + +// ExpireAccessTokens removes old, known to be uneligible tokens +func ExpireAccessTokens() error { + _, err := db.ExecOrchestrator(` + delete + from access_token + where + generated_at < now() - interval ? minute + and is_reentrant = 0 + `, + config.Config.AccessTokenExpiryMinutes, + ) + return log.Errore(err) +} diff --git a/go/vt/orchestrator/process/election_dao.go b/go/vt/orchestrator/process/election_dao.go new file mode 100644 index 0000000000..e121e99ef9 --- /dev/null +++ b/go/vt/orchestrator/process/election_dao.go @@ -0,0 +1,155 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package process + +import ( + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + orcraft "vitess.io/vitess/go/vt/orchestrator/raft" + "vitess.io/vitess/go/vt/orchestrator/util" +) + +// AttemptElection tries to grab leadership (become active node) +func AttemptElection() (bool, error) { + { + sqlResult, err := db.ExecOrchestrator(` + insert ignore into active_node ( + anchor, hostname, token, first_seen_active, last_seen_active + ) values ( + 1, ?, ?, now(), now() + ) + `, + ThisHostname, util.ProcessToken.Hash, + ) + if err != nil { + return false, log.Errore(err) + } + rows, err := sqlResult.RowsAffected() + if err != nil { + return false, log.Errore(err) + } + if rows > 0 { + // We managed to insert a row + return true, nil + } + } + { + // takeover from a node that has been inactive + sqlResult, err := db.ExecOrchestrator(` + update active_node set + hostname = ?, + token = ?, + first_seen_active=now(), + last_seen_active=now() + where + anchor = 1 + and last_seen_active < (now() - interval ? second) + `, + ThisHostname, util.ProcessToken.Hash, config.ActiveNodeExpireSeconds, + ) + if err != nil { + return false, log.Errore(err) + } + rows, err := sqlResult.RowsAffected() + if err != nil { + return false, log.Errore(err) + } + if rows > 0 { + // We managed to update a row: overtaking a previous leader + return true, nil + } + } + { + // Update last_seen_active is this very node is already the active node + sqlResult, err := db.ExecOrchestrator(` + update active_node set + last_seen_active=now() + where + anchor = 1 + and hostname = ? + and token = ? + `, + ThisHostname, util.ProcessToken.Hash, + ) + if err != nil { + return false, log.Errore(err) + } + rows, err := sqlResult.RowsAffected() + if err != nil { + return false, log.Errore(err) + } + if rows > 0 { + // Reaffirmed our own leadership + return true, nil + } + } + return false, nil +} + +// GrabElection forcibly grabs leadership. Use with care!! +func GrabElection() error { + if orcraft.IsRaftEnabled() { + return log.Errorf("Cannot GrabElection on raft setup") + } + _, err := db.ExecOrchestrator(` + replace into active_node ( + anchor, hostname, token, first_seen_active, last_seen_active + ) values ( + 1, ?, ?, now(), now() + ) + `, + ThisHostname, util.ProcessToken.Hash, + ) + return log.Errore(err) +} + +// Reelect clears the way for re-elections. Active node is immediately demoted. +func Reelect() error { + if orcraft.IsRaftEnabled() { + orcraft.StepDown() + } + _, err := db.ExecOrchestrator(`delete from active_node where anchor = 1`) + return log.Errore(err) +} + +// ElectedNode returns the details of the elected node, as well as answering the question "is this process the elected one"? 
+func ElectedNode() (node NodeHealth, isElected bool, err error) { + query := ` + select + hostname, + token, + first_seen_active, + last_seen_Active + from + active_node + where + anchor = 1 + ` + err = db.QueryOrchestratorRowsMap(query, func(m sqlutils.RowMap) error { + node.Hostname = m.GetString("hostname") + node.Token = m.GetString("token") + node.FirstSeenActive = m.GetString("first_seen_active") + node.LastSeenActive = m.GetString("last_seen_active") + + return nil + }) + + isElected = (node.Hostname == ThisHostname && node.Token == util.ProcessToken.Hash) + return node, isElected, log.Errore(err) +} diff --git a/go/vt/orchestrator/process/health.go b/go/vt/orchestrator/process/health.go new file mode 100644 index 0000000000..cc1dd9f8e9 --- /dev/null +++ b/go/vt/orchestrator/process/health.go @@ -0,0 +1,190 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package process + +import ( + "sync" + "sync/atomic" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/util" + + "github.com/patrickmn/go-cache" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + orcraft "vitess.io/vitess/go/vt/orchestrator/raft" +) + +var lastHealthCheckUnixNano int64 +var lastGoodHealthCheckUnixNano int64 +var LastContinousCheckHealthy int64 + +var lastHealthCheckCache = cache.New(config.HealthPollSeconds*time.Second, time.Second) + +type NodeHealth struct { + Hostname string + Token string + AppVersion string + FirstSeenActive string + LastSeenActive string + ExtraInfo string + Command string + DBBackend string + + LastReported time.Time + onceHistory sync.Once + onceUpdate sync.Once +} + +func NewNodeHealth() *NodeHealth { + return &NodeHealth{ + Hostname: ThisHostname, + Token: util.ProcessToken.Hash, + AppVersion: config.RuntimeCLIFlags.ConfiguredVersion, + } +} + +func (nodeHealth *NodeHealth) Update() *NodeHealth { + nodeHealth.onceUpdate.Do(func() { + nodeHealth.Hostname = ThisHostname + nodeHealth.Token = util.ProcessToken.Hash + nodeHealth.AppVersion = config.RuntimeCLIFlags.ConfiguredVersion + }) + nodeHealth.LastReported = time.Now() + return nodeHealth +} + +var ThisNodeHealth = NewNodeHealth() + +type HealthStatus struct { + Healthy bool + Hostname string + Token string + IsActiveNode bool + ActiveNode NodeHealth + Error error + AvailableNodes [](*NodeHealth) + RaftLeader string + IsRaftLeader bool + RaftLeaderURI string + RaftAdvertise string + RaftHealthyMembers []string +} + +type OrchestratorExecutionMode string + +const ( + OrchestratorExecutionCliMode OrchestratorExecutionMode = "CLIMode" + OrchestratorExecutionHttpMode = "HttpMode" +) + +var continuousRegistrationOnce sync.Once + +func RegisterNode(nodeHealth *NodeHealth) (healthy bool, err error) { + nodeHealth.Update() + healthy, err = WriteRegisterNode(nodeHealth) + atomic.StoreInt64(&lastHealthCheckUnixNano, time.Now().UnixNano()) + if healthy { + atomic.StoreInt64(&lastGoodHealthCheckUnixNano, time.Now().UnixNano()) + } + return healthy, err 
+} + +// HealthTest attempts to write to the backend database and get a result +func HealthTest() (health *HealthStatus, err error) { + cacheKey := util.ProcessToken.Hash + if healthStatus, found := lastHealthCheckCache.Get(cacheKey); found { + return healthStatus.(*HealthStatus), nil + } + + health = &HealthStatus{Healthy: false, Hostname: ThisHostname, Token: util.ProcessToken.Hash} + defer lastHealthCheckCache.Set(cacheKey, health, cache.DefaultExpiration) + + if healthy, err := RegisterNode(ThisNodeHealth); err != nil { + health.Error = err + return health, log.Errore(err) + } else { + health.Healthy = healthy + } + + if orcraft.IsRaftEnabled() { + health.ActiveNode.Hostname = orcraft.GetLeader() + health.IsActiveNode = orcraft.IsLeader() + health.RaftLeader = orcraft.GetLeader() + health.RaftLeaderURI = orcraft.LeaderURI.Get() + health.IsRaftLeader = orcraft.IsLeader() + health.RaftAdvertise = config.Config.RaftAdvertise + health.RaftHealthyMembers = orcraft.HealthyMembers() + } else { + if health.ActiveNode, health.IsActiveNode, err = ElectedNode(); err != nil { + health.Error = err + return health, log.Errore(err) + } + } + health.AvailableNodes, err = ReadAvailableNodes(true) + + return health, nil +} + +func SinceLastHealthCheck() time.Duration { + timeNano := atomic.LoadInt64(&lastHealthCheckUnixNano) + if timeNano == 0 { + return 0 + } + return time.Since(time.Unix(0, timeNano)) +} + +func SinceLastGoodHealthCheck() time.Duration { + timeNano := atomic.LoadInt64(&lastGoodHealthCheckUnixNano) + if timeNano == 0 { + return 0 + } + return time.Since(time.Unix(0, timeNano)) +} + +// ContinuousRegistration will continuously update the node_health +// table showing that the current process is still running. +func ContinuousRegistration(extraInfo string, command string) { + ThisNodeHealth.ExtraInfo = extraInfo + ThisNodeHealth.Command = command + continuousRegistrationOnce.Do(func() { + tickOperation := func() { + healthy, err := RegisterNode(ThisNodeHealth) + if err != nil { + log.Errorf("ContinuousRegistration: RegisterNode failed: %+v", err) + } + if healthy { + atomic.StoreInt64(&LastContinousCheckHealthy, 1) + } else { + atomic.StoreInt64(&LastContinousCheckHealthy, 0) + } + } + // First one is synchronous + tickOperation() + go func() { + registrationTick := time.Tick(config.HealthPollSeconds * time.Second) + for range registrationTick { + // We already run inside a go-routine so + // do not do this asynchronously. If we + // get stuck then we don't want to fill up + // the backend pool with connections running + // this maintenance operation. + tickOperation() + } + }() + }) +} diff --git a/go/vt/orchestrator/process/health_dao.go b/go/vt/orchestrator/process/health_dao.go new file mode 100644 index 0000000000..8984f4a98a --- /dev/null +++ b/go/vt/orchestrator/process/health_dao.go @@ -0,0 +1,195 @@ +/* + Copyright 2015 Shlomi Noach, courtesy Booking.com + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package process + +import ( + "time" + + "fmt" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/db" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" +) + +// RegisterNode writes down this node in the node_health table +func WriteRegisterNode(nodeHealth *NodeHealth) (healthy bool, err error) { + timeNow := time.Now() + reportedAgo := timeNow.Sub(nodeHealth.LastReported) + reportedSecondsAgo := int64(reportedAgo.Seconds()) + if reportedSecondsAgo > config.HealthPollSeconds*2 { + // This entry is too old. No reason to persist it; already expired. + return false, nil + } + + nodeHealth.onceHistory.Do(func() { + db.ExecOrchestrator(` + insert ignore into node_health_history + (hostname, token, first_seen_active, extra_info, command, app_version) + values + (?, ?, NOW(), ?, ?, ?) + `, + nodeHealth.Hostname, nodeHealth.Token, nodeHealth.ExtraInfo, nodeHealth.Command, + nodeHealth.AppVersion, + ) + }) + { + sqlResult, err := db.ExecOrchestrator(` + update node_health set + last_seen_active = now() - interval ? second, + extra_info = case when ? != '' then ? else extra_info end, + app_version = ?, + incrementing_indicator = incrementing_indicator + 1 + where + hostname = ? + and token = ? + `, + reportedSecondsAgo, + nodeHealth.ExtraInfo, nodeHealth.ExtraInfo, + nodeHealth.AppVersion, + nodeHealth.Hostname, nodeHealth.Token, + ) + if err != nil { + return false, log.Errore(err) + } + rows, err := sqlResult.RowsAffected() + if err != nil { + return false, log.Errore(err) + } + if rows > 0 { + return true, nil + } + } + // Got here? The UPDATE didn't work. Row isn't there. + { + dbBackend := "" + if config.Config.IsSQLite() { + dbBackend = config.Config.SQLite3DataFile + } else { + dbBackend = fmt.Sprintf("%s:%d", config.Config.MySQLOrchestratorHost, + config.Config.MySQLOrchestratorPort) + } + sqlResult, err := db.ExecOrchestrator(` + insert ignore into node_health + (hostname, token, first_seen_active, last_seen_active, extra_info, command, app_version, db_backend) + values ( + ?, ?, + now() - interval ? second, now() - interval ? second, + ?, ?, ?, ?) + `, + nodeHealth.Hostname, nodeHealth.Token, + reportedSecondsAgo, reportedSecondsAgo, + nodeHealth.ExtraInfo, nodeHealth.Command, + nodeHealth.AppVersion, dbBackend, + ) + if err != nil { + return false, log.Errore(err) + } + rows, err := sqlResult.RowsAffected() + if err != nil { + return false, log.Errore(err) + } + if rows > 0 { + return true, nil + } + } + return false, nil +} + +// ExpireAvailableNodes is an aggressive purging method to remove +// node entries who have skipped their keepalive for two times. +func ExpireAvailableNodes() { + _, err := db.ExecOrchestrator(` + delete + from node_health + where + last_seen_active < now() - interval ? second + `, + config.HealthPollSeconds*5, + ) + if err != nil { + log.Errorf("ExpireAvailableNodes: failed to remove old entries: %+v", err) + } +} + +// ExpireNodesHistory cleans up the nodes history and is run by +// the orchestrator active node. +func ExpireNodesHistory() error { + _, err := db.ExecOrchestrator(` + delete + from node_health_history + where + first_seen_active < now() - interval ? 
+				hour
+		`,
+		config.Config.UnseenInstanceForgetHours,
+	)
+	return log.Errore(err)
+}
+
+func ReadAvailableNodes(onlyHttpNodes bool) (nodes [](*NodeHealth), err error) {
+	extraInfo := ""
+	if onlyHttpNodes {
+		extraInfo = string(OrchestratorExecutionHttpMode)
+	}
+	query := `
+		select
+			hostname, token, app_version, first_seen_active, last_seen_active, db_backend
+		from
+			node_health
+		where
+			last_seen_active > now() - interval ? second
+			and ? in (extra_info, '')
+		order by
+			hostname
+		`
+
+	err = db.QueryOrchestrator(query, sqlutils.Args(config.HealthPollSeconds*2, extraInfo), func(m sqlutils.RowMap) error {
+		nodeHealth := &NodeHealth{
+			Hostname:        m.GetString("hostname"),
+			Token:           m.GetString("token"),
+			AppVersion:      m.GetString("app_version"),
+			FirstSeenActive: m.GetString("first_seen_active"),
+			LastSeenActive:  m.GetString("last_seen_active"),
+			DBBackend:       m.GetString("db_backend"),
+		}
+		nodes = append(nodes, nodeHealth)
+		return nil
+	})
+	return nodes, log.Errore(err)
+}
+
+func TokenBelongsToHealthyHttpService(token string) (result bool, err error) {
+	extraInfo := string(OrchestratorExecutionHttpMode)
+
+	query := `
+		select
+			token
+		from
+			node_health
+		where
+			token = ?
+			and extra_info = ?
+		`
+
+	err = db.QueryOrchestrator(query, sqlutils.Args(token, extraInfo), func(m sqlutils.RowMap) error {
+		// Row exists? We're happy
+		result = true
+		return nil
+	})
+	return result, log.Errore(err)
+}
diff --git a/go/vt/orchestrator/process/host.go b/go/vt/orchestrator/process/host.go
new file mode 100644
index 0000000000..8a2cfd7c72
--- /dev/null
+++ b/go/vt/orchestrator/process/host.go
@@ -0,0 +1,33 @@
+/*
+	Copyright 2015 Shlomi Noach, courtesy Booking.com
+
+	Licensed under the Apache License, Version 2.0 (the "License");
+	you may not use this file except in compliance with the License.
+	You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+*/
+
+package process
+
+import (
+	"os"
+
+	"vitess.io/vitess/go/vt/orchestrator/external/golib/log"
+)
+
+var ThisHostname string
+
+func init() {
+	var err error
+	ThisHostname, err = os.Hostname()
+	if err != nil {
+		log.Fatalf("Cannot resolve self hostname; required. Aborting. %+v", err)
+	}
+}
diff --git a/go/vt/orchestrator/raft/applier.go b/go/vt/orchestrator/raft/applier.go
new file mode 100644
index 0000000000..c42254745c
--- /dev/null
+++ b/go/vt/orchestrator/raft/applier.go
@@ -0,0 +1,21 @@
+/*
+	Copyright 2017 Shlomi Noach, GitHub Inc.
+
+	Licensed under the Apache License, Version 2.0 (the "License");
+	you may not use this file except in compliance with the License.
+	You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+	Unless required by applicable law or agreed to in writing, software
+	distributed under the License is distributed on an "AS IS" BASIS,
+	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+	See the License for the specific language governing permissions and
+	limitations under the License.
+*/ + +package orcraft + +type CommandApplier interface { + ApplyCommand(op string, value []byte) interface{} +} diff --git a/go/vt/orchestrator/raft/file_snapshot.go b/go/vt/orchestrator/raft/file_snapshot.go new file mode 100644 index 0000000000..c12cdbdf0c --- /dev/null +++ b/go/vt/orchestrator/raft/file_snapshot.go @@ -0,0 +1,497 @@ +package orcraft + +import ( + "bufio" + "bytes" + "encoding/json" + "fmt" + "hash" + "hash/crc64" + "io" + "io/ioutil" + "os" + "path/filepath" + "sort" + "strings" + "time" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + + "vitess.io/vitess/go/vt/orchestrator/external/raft" +) + +const ( + testPath = "permTest" + snapPath = "snapshots" + metaFilePath = "meta.json" + stateFilePath = "state.bin" + tmpSuffix = ".tmp" +) + +// FileSnapshotStore implements the SnapshotStore interface and allows +// snapshots to be made on the local disk. +type FileSnapshotStore struct { + path string + retain int +} + +type snapMetaSlice []*fileSnapshotMeta + +// FileSnapshotSink implements SnapshotSink with a file. +type FileSnapshotSink struct { + store *FileSnapshotStore + dir string + meta fileSnapshotMeta + + stateFile *os.File + stateHash hash.Hash64 + buffered *bufio.Writer + + closed bool +} + +// fileSnapshotMeta is stored on disk. We also put a CRC +// on disk so that we can verify the snapshot. +type fileSnapshotMeta struct { + raft.SnapshotMeta + CRC []byte +} + +// bufferedFile is returned when we open a snapshot. This way +// reads are buffered and the file still gets closed. +type bufferedFile struct { + bh *bufio.Reader + fh *os.File +} + +func (b *bufferedFile) Read(p []byte) (n int, err error) { + return b.bh.Read(p) +} + +func (b *bufferedFile) Close() error { + return b.fh.Close() +} + +// NewFileSnapshotStoreWithLogger creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. +func NewFileSnapshotStoreWithLogger(base string, retain int) (*FileSnapshotStore, error) { + if retain < 1 { + return nil, fmt.Errorf("must retain at least one snapshot") + } + + // Ensure our path exists + path := filepath.Join(base, snapPath) + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return nil, fmt.Errorf("snapshot path not accessible: %v", err) + } + + // Setup the store + store := &FileSnapshotStore{ + path: path, + retain: retain, + } + + // Do a permissions test + if err := store.testPermissions(); err != nil { + return nil, fmt.Errorf("permissions test failed: %v", err) + } + return store, nil +} + +// NewFileSnapshotStore creates a new FileSnapshotStore based +// on a base directory. The `retain` parameter controls how many +// snapshots are retained. Must be at least 1. +func NewFileSnapshotStore(base string, retain int, logOutput io.Writer) (*FileSnapshotStore, error) { + if logOutput == nil { + logOutput = os.Stderr + } + return NewFileSnapshotStoreWithLogger(base, retain) +} + +// testPermissions tries to touch a file in our path to see if it works. +func (f *FileSnapshotStore) testPermissions() error { + path := filepath.Join(f.path, testPath) + fh, err := os.Create(path) + if err != nil { + return err + } + + if err = fh.Close(); err != nil { + return err + } + + if err = os.Remove(path); err != nil { + return err + } + return nil +} + +// snapshotName generates a name for the snapshot. 
+func snapshotName(term, index uint64) string { + now := time.Now() + msec := now.UnixNano() / int64(time.Millisecond) + return fmt.Sprintf("%d-%d-%d", term, index, msec) +} + +// Create is used to start a new snapshot +func (f *FileSnapshotStore) Create(index, term uint64, peers []byte) (raft.SnapshotSink, error) { + // Create a new path + name := snapshotName(term, index) + path := filepath.Join(f.path, name+tmpSuffix) + log.Infof("snapshot: Creating new snapshot at %s", path) + + // Make the directory + if err := os.MkdirAll(path, 0755); err != nil { + _ = log.Error("snapshot: Failed to make snapshot directory: %v", err) + return nil, err + } + + // Create the sink + sink := &FileSnapshotSink{ + store: f, + dir: path, + meta: fileSnapshotMeta{ + SnapshotMeta: raft.SnapshotMeta{ + ID: name, + Index: index, + Term: term, + Peers: peers, + }, + CRC: nil, + }, + } + + // Write out the meta data + if err := sink.writeMeta(); err != nil { + _ = log.Errorf("snapshot: Failed to write metadata: %v", err) + return nil, err + } + + // Open the state file + statePath := filepath.Join(path, stateFilePath) + fh, err := os.Create(statePath) + if err != nil { + _ = log.Errorf("snapshot: Failed to create state file: %v", err) + return nil, err + } + sink.stateFile = fh + + // Create a CRC64 hash + sink.stateHash = crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Wrap both the hash and file in a MultiWriter with buffering + multi := io.MultiWriter(sink.stateFile, sink.stateHash) + sink.buffered = bufio.NewWriter(multi) + + // Done + return sink, nil +} + +// List returns available snapshots in the store. +func (f *FileSnapshotStore) List() ([]*raft.SnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := f.getSnapshots() + if err != nil { + _ = log.Errorf("snapshot: Failed to get snapshots: %v", err) + return nil, err + } + + var snapMeta []*raft.SnapshotMeta + for _, meta := range snapshots { + snapMeta = append(snapMeta, &meta.SnapshotMeta) + if len(snapMeta) == f.retain { + break + } + } + return snapMeta, nil +} + +// getSnapshots returns all the known snapshots. 
+func (f *FileSnapshotStore) getSnapshots() ([]*fileSnapshotMeta, error) { + // Get the eligible snapshots + snapshots, err := ioutil.ReadDir(f.path) + if err != nil { + _ = log.Errorf("snapshot: Failed to scan snapshot dir: %v", err) + return nil, err + } + + // Populate the metadata + var snapMeta []*fileSnapshotMeta + for _, snap := range snapshots { + // Ignore any files + if !snap.IsDir() { + continue + } + + // Ignore any temporary snapshots + dirName := snap.Name() + if strings.HasSuffix(dirName, tmpSuffix) { + _ = log.Warningf("snapshot: Found temporary snapshot: %v", dirName) + continue + } + + // Try to read the meta data + meta, err := f.readMeta(dirName) + if err != nil { + _ = log.Warningf("snapshot: Failed to read metadata for %v: %v", dirName, err) + continue + } + + // Append, but only return up to the retain count + snapMeta = append(snapMeta, meta) + } + + // Sort the snapshot, reverse so we get new -> old + sort.Sort(sort.Reverse(snapMetaSlice(snapMeta))) + + return snapMeta, nil +} + +// readMeta is used to read the meta data for a given named backup +func (f *FileSnapshotStore) readMeta(name string) (*fileSnapshotMeta, error) { + // Open the meta file + metaPath := filepath.Join(f.path, name, metaFilePath) + fh, err := os.Open(metaPath) + if err != nil { + return nil, err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewReader(fh) + + // Read in the JSON + meta := &fileSnapshotMeta{} + dec := json.NewDecoder(buffered) + if err := dec.Decode(meta); err != nil { + return nil, err + } + return meta, nil +} + +// Open takes a snapshot ID and returns a ReadCloser for that snapshot. +func (f *FileSnapshotStore) Open(id string) (*raft.SnapshotMeta, io.ReadCloser, error) { + // Get the metadata + meta, err := f.readMeta(id) + if err != nil { + _ = log.Errorf("snapshot: Failed to get meta data to open snapshot: %v", err) + return nil, nil, err + } + + // Open the state file + statePath := filepath.Join(f.path, id, stateFilePath) + fh, err := os.Open(statePath) + if err != nil { + _ = log.Errorf("snapshot: Failed to open state file: %v", err) + return nil, nil, err + } + + // Create a CRC64 hash + stateHash := crc64.New(crc64.MakeTable(crc64.ECMA)) + + // Compute the hash + _, err = io.Copy(stateHash, fh) + if err != nil { + _ = log.Errorf("snapshot: Failed to read state file: %v", err) + fh.Close() + return nil, nil, err + } + + // Verify the hash + computed := stateHash.Sum(nil) + if bytes.Compare(meta.CRC, computed) != 0 { + _ = log.Errorf("snapshot: CRC checksum failed (stored: %v computed: %v)", + meta.CRC, computed) + fh.Close() + return nil, nil, fmt.Errorf("CRC mismatch") + } + + // Seek to the start + if _, err := fh.Seek(0, 0); err != nil { + _ = log.Errorf("snapshot: State file seek failed: %v", err) + fh.Close() + return nil, nil, err + } + + // Return a buffered file + buffered := &bufferedFile{ + bh: bufio.NewReader(fh), + fh: fh, + } + + return &meta.SnapshotMeta, buffered, nil +} + +// ReapSnapshots reaps any snapshots beyond the retain count. 
+func (f *FileSnapshotStore) ReapSnapshots(currentSnapshotMeta *fileSnapshotMeta) error { + + reapSnapshot := func(snapshot *fileSnapshotMeta) error { + path := filepath.Join(f.path, snapshot.ID) + log.Infof("snapshot: reaping snapshot %v", path) + if err := os.RemoveAll(path); err != nil { + _ = log.Errorf("snapshot: Failed to reap snapshot %v: %v", path, err) + return err + } + return nil + } + snapshots, err := f.getSnapshots() + if err != nil { + _ = log.Errorf("snapshot: Failed to get snapshots: %v", err) + return err + } + + deprecatedSnapshotsReaped := false + for _, snapshot := range snapshots { + if snapshot.Term > currentSnapshotMeta.Term || + snapshot.Term == currentSnapshotMeta.Term && snapshot.Index > currentSnapshotMeta.Index { + reapSnapshot(snapshot) + deprecatedSnapshotsReaped = true + } + } + + if deprecatedSnapshotsReaped { + // re-read list, since we've removed files + snapshots, err = f.getSnapshots() + if err != nil { + _ = log.Errorf("snapshot: Failed to get snapshots: %v", err) + return err + } + } + for i := f.retain; i < len(snapshots); i++ { + reapSnapshot(snapshots[i]) + } + return nil +} + +// ID returns the ID of the snapshot, can be used with Open() +// after the snapshot is finalized. +func (s *FileSnapshotSink) ID() string { + return s.meta.ID +} + +// Write is used to append to the state file. We write to the +// buffered IO object to reduce the amount of context switches. +func (s *FileSnapshotSink) Write(b []byte) (int, error) { + return s.buffered.Write(b) +} + +// Close is used to indicate a successful end. +func (s *FileSnapshotSink) Close() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + _ = log.Errorf("snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Write out the meta data + if err := s.writeMeta(); err != nil { + _ = log.Errorf("snapshot: Failed to write metadata: %v", err) + return err + } + + // Move the directory into place + newPath := strings.TrimSuffix(s.dir, tmpSuffix) + if err := os.Rename(s.dir, newPath); err != nil { + _ = log.Errorf("snapshot: Failed to move snapshot into place: %v", err) + return err + } + + // Reap any old snapshots + if err := s.store.ReapSnapshots(&s.meta); err != nil { + return err + } + + return nil +} + +// Cancel is used to indicate an unsuccessful end. +func (s *FileSnapshotSink) Cancel() error { + // Make sure close is idempotent + if s.closed { + return nil + } + s.closed = true + + // Close the open handles + if err := s.finalize(); err != nil { + _ = log.Errorf("snapshot: Failed to finalize snapshot: %v", err) + return err + } + + // Attempt to remove all artifacts + return os.RemoveAll(s.dir) +} + +// finalize is used to close all of our resources. +func (s *FileSnapshotSink) finalize() error { + // Flush any remaining data + if err := s.buffered.Flush(); err != nil { + return err + } + + // Get the file size + stat, statErr := s.stateFile.Stat() + + // Close the file + if err := s.stateFile.Close(); err != nil { + return err + } + + // Set the file size, check after we close + if statErr != nil { + return statErr + } + s.meta.Size = stat.Size() + + // Set the CRC + s.meta.CRC = s.stateHash.Sum(nil) + return nil +} + +// writeMeta is used to write out the metadata we have. 
+func (s *FileSnapshotSink) writeMeta() error { + // Open the meta file + metaPath := filepath.Join(s.dir, metaFilePath) + fh, err := os.Create(metaPath) + if err != nil { + return err + } + defer fh.Close() + + // Buffer the file IO + buffered := bufio.NewWriter(fh) + defer buffered.Flush() + + // Write out as JSON + enc := json.NewEncoder(buffered) + if err := enc.Encode(&s.meta); err != nil { + return err + } + return nil +} + +// Implement the sort interface for []*fileSnapshotMeta. +func (s snapMetaSlice) Len() int { + return len(s) +} + +func (s snapMetaSlice) Less(i, j int) bool { + if s[i].Term != s[j].Term { + return s[i].Term < s[j].Term + } + if s[i].Index != s[j].Index { + return s[i].Index < s[j].Index + } + return s[i].ID < s[j].ID +} + +func (s snapMetaSlice) Swap(i, j int) { + s[i], s[j] = s[j], s[i] +} diff --git a/go/vt/orchestrator/raft/fsm.go b/go/vt/orchestrator/raft/fsm.go new file mode 100644 index 0000000000..aac5eb8e29 --- /dev/null +++ b/go/vt/orchestrator/raft/fsm.go @@ -0,0 +1,94 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package orcraft + +import ( + "encoding/json" + "io" + "strings" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + + "vitess.io/vitess/go/vt/orchestrator/external/raft" +) + +// fsm is a raft finite state machine +type fsm Store + +// Apply applies a Raft log entry to the key-value store. 
+func (f *fsm) Apply(l *raft.Log) interface{} {
+	var c storeCommand
+	if err := json.Unmarshal(l.Data, &c); err != nil {
+		log.Errorf("failed to unmarshal command: %s", err.Error())
+	}
+
+	if c.Op == YieldCommand {
+		toPeer, err := normalizeRaftNode(string(c.Value))
+		if err != nil {
+			return log.Errore(err)
+		}
+		return f.yield(toPeer)
+	}
+	if c.Op == YieldHintCommand {
+		hint := string(c.Value)
+		return f.yieldByHint(hint)
+	}
+	log.Debugf("orchestrator/raft: applying command %+v: %s", l.Index, c.Op)
+	return store.applier.ApplyCommand(c.Op, c.Value)
+}
+
+// yield yields to a suggested peer, or does nothing if this peer IS the suggested peer
+func (f *fsm) yield(toPeer string) interface{} {
+	isThisPeer, err := IsPeer(toPeer)
+	if err != nil {
+		return log.Errorf("failed to determine whether %s is this peer: %s", toPeer, err.Error())
+	}
+	if isThisPeer {
+		log.Debugf("Will not yield to myself")
+		return nil
+	}
+	log.Debugf("Yielding to %s", toPeer)
+	return Yield()
+}
+
+// yieldByHint yields to a host that contains the given hint
+func (f *fsm) yieldByHint(hint string) interface{} {
+	if hint == "" {
+		log.Debugf("Will not yield by empty hint")
+		return nil
+	}
+	isThisHost := strings.Contains(ThisHostname, hint)
+	if isThisHost {
+		log.Debugf("Will not yield to myself")
+		return nil
+	}
+	log.Debugf("Yielding to hinted %s", hint)
+	return Yield()
+}
+
+// Snapshot returns a snapshot object of orchestrator's state
+func (f *fsm) Snapshot() (raft.FSMSnapshot, error) {
+	snapshot := newFsmSnapshot(f.snapshotCreatorApplier)
+	return snapshot, nil
+}
+
+// Restore restores orchestrator's state from a snapshot
+func (f *fsm) Restore(rc io.ReadCloser) error {
+	defer rc.Close()
+
+	return f.snapshotCreatorApplier.Restore(rc)
+}
diff --git a/go/vt/orchestrator/raft/fsm_snapshot.go b/go/vt/orchestrator/raft/fsm_snapshot.go
new file mode 100644
index 0000000000..fcd1e670ff
--- /dev/null
+++ b/go/vt/orchestrator/raft/fsm_snapshot.go
@@ -0,0 +1,48 @@
+/*
+   Copyright 2017 Shlomi Noach, GitHub Inc.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package orcraft
+
+import (
+	"vitess.io/vitess/go/vt/orchestrator/external/raft"
+)
+
+// fsmSnapshot handles raft persisting of snapshots
+type fsmSnapshot struct {
+	snapshotCreatorApplier SnapshotCreatorApplier
+}
+
+func newFsmSnapshot(snapshotCreatorApplier SnapshotCreatorApplier) *fsmSnapshot {
+	return &fsmSnapshot{
+		snapshotCreatorApplier: snapshotCreatorApplier,
+	}
+}
+
+// Persist writes the snapshot data out to the given sink
+func (f *fsmSnapshot) Persist(sink raft.SnapshotSink) error {
+	data, err := f.snapshotCreatorApplier.GetData()
+	if err != nil {
+		return err
+	}
+	if _, err := sink.Write(data); err != nil {
+		return err
+	}
+	return sink.Close()
+}
+
+// Release is a no-op
+func (f *fsmSnapshot) Release() {
+}
diff --git a/go/vt/orchestrator/raft/http_client.go b/go/vt/orchestrator/raft/http_client.go
new file mode 100644
index 0000000000..9e69bb8602
--- /dev/null
+++ b/go/vt/orchestrator/raft/http_client.go
@@ -0,0 +1,109 @@
+/*
+   Copyright 2017 Shlomi Noach, GitHub Inc.
+ + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package orcraft + +import ( + "crypto/tls" + "fmt" + "io/ioutil" + "net" + "net/http" + "strings" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/ssl" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +var httpClient *http.Client + +func setupHttpClient() error { + httpTimeout := time.Duration(config.ActiveNodeExpireSeconds) * time.Second + dialTimeout := func(network, addr string) (net.Conn, error) { + return net.DialTimeout(network, addr, httpTimeout) + } + + tlsConfig := &tls.Config{ + InsecureSkipVerify: config.Config.SSLSkipVerify, + } + if config.Config.UseSSL { + caPool, err := ssl.ReadCAFile(config.Config.SSLCAFile) + if err != nil { + return err + } + tlsConfig.RootCAs = caPool + + if config.Config.UseMutualTLS { + var sslPEMPassword []byte + if ssl.IsEncryptedPEM(config.Config.SSLPrivateKeyFile) { + sslPEMPassword = ssl.GetPEMPassword(config.Config.SSLPrivateKeyFile) + } + if err := ssl.AppendKeyPairWithPassword(tlsConfig, config.Config.SSLCertFile, config.Config.SSLPrivateKeyFile, sslPEMPassword); err != nil { + return err + } + } + } + + httpTransport := &http.Transport{ + TLSClientConfig: tlsConfig, + Dial: dialTimeout, + ResponseHeaderTimeout: httpTimeout, + } + httpClient = &http.Client{Transport: httpTransport} + + return nil +} + +func HttpGetLeader(path string) (response []byte, err error) { + leaderURI := LeaderURI.Get() + if leaderURI == "" { + return nil, fmt.Errorf("Raft leader URI unknown") + } + leaderAPI := leaderURI + if config.Config.URLPrefix != "" { + // We know URLPrefix begind with "/" + leaderAPI = fmt.Sprintf("%s%s", leaderAPI, config.Config.URLPrefix) + } + leaderAPI = fmt.Sprintf("%s/api", leaderAPI) + + url := fmt.Sprintf("%s/%s", leaderAPI, path) + + req, err := http.NewRequest("GET", url, nil) + switch strings.ToLower(config.Config.AuthenticationMethod) { + case "basic", "multi": + req.SetBasicAuth(config.Config.HTTPAuthUser, config.Config.HTTPAuthPassword) + } + + res, err := httpClient.Do(req) + if err != nil { + return nil, err + } + defer res.Body.Close() + + body, err := ioutil.ReadAll(res.Body) + if err != nil { + return nil, err + } + + if res.StatusCode != http.StatusOK { + return body, log.Errorf("HttpGetLeader: got %d status on %s", res.StatusCode, url) + } + + return body, nil +} diff --git a/go/vt/orchestrator/raft/raft.go b/go/vt/orchestrator/raft/raft.go new file mode 100644 index 0000000000..d68f1b550c --- /dev/null +++ b/go/vt/orchestrator/raft/raft.go @@ -0,0 +1,409 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package orcraft + +import ( + "encoding/json" + "fmt" + "math/rand" + "net" + "strings" + "sync" + "sync/atomic" + "time" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/util" + + "github.com/patrickmn/go-cache" + "vitess.io/vitess/go/vt/orchestrator/external/raft" +) + +const ( + YieldCommand = "yield" + YieldHintCommand = "yield-hint" +) + +const ( + retainSnapshotCount = 10 + snapshotInterval = 30 * time.Minute + asyncSnapshotTimeframe = 1 * time.Minute + raftTimeout = 10 * time.Second +) + +var RaftNotRunning error = fmt.Errorf("raft is not configured/running") +var store *Store +var raftSetupComplete int64 +var ThisHostname string +var healthRequestAuthenticationTokenCache = cache.New(config.RaftHealthPollSeconds*2*time.Second, time.Second) +var healthReportsCache = cache.New(config.RaftHealthPollSeconds*2*time.Second, time.Second) +var healthRequestReportCache = cache.New(time.Second, time.Second) + +var fatalRaftErrorChan = make(chan error) + +type leaderURI struct { + uri string + sync.Mutex +} + +var LeaderURI leaderURI +var thisLeaderURI string // How this node identifies itself assuming it is the leader + +func (luri *leaderURI) Get() string { + luri.Lock() + defer luri.Unlock() + return luri.uri +} + +func (luri *leaderURI) Set(uri string) { + luri.Lock() + defer luri.Unlock() + luri.uri = uri +} + +func (luri *leaderURI) IsThisLeaderURI() bool { + luri.Lock() + defer luri.Unlock() + return luri.uri == thisLeaderURI +} + +func IsRaftEnabled() bool { + return store != nil +} + +func FatalRaftError(err error) error { + if err != nil { + go func() { fatalRaftErrorChan <- err }() + } + return err +} + +func computeLeaderURI() (uri string, err error) { + if config.Config.HTTPAdvertise != "" { + // Explicitly given + return config.Config.HTTPAdvertise, nil + } + // Not explicitly given. Let's heuristically compute using RaftAdvertise + scheme := "http" + if config.Config.UseSSL { + scheme = "https" + } + + hostname := strings.Split(config.Config.RaftAdvertise, ":")[0] + listenTokens := strings.Split(config.Config.ListenAddress, ":") + if len(listenTokens) < 2 { + return uri, fmt.Errorf("computeLeaderURI: cannot determine listen port out of config.Config.ListenAddress: %+v", config.Config.ListenAddress) + } + port := listenTokens[1] + + uri = fmt.Sprintf("%s://%s:%s", scheme, hostname, port) + return uri, nil +} + +// Setup creates the entire raft shananga. Creates the store, associates with the throttler, +// contacts peer nodes, and subscribes to leader changes to export them. 
+func Setup(applier CommandApplier, snapshotCreatorApplier SnapshotCreatorApplier, thisHostname string) error { + log.Debugf("Setting up raft") + ThisHostname = thisHostname + raftBind, err := normalizeRaftNode(config.Config.RaftBind) + if err != nil { + return err + } + raftAdvertise, err := normalizeRaftNode(config.Config.RaftAdvertise) + if err != nil { + return err + } + store = NewStore(config.Config.RaftDataDir, raftBind, raftAdvertise, applier, snapshotCreatorApplier) + peerNodes := []string{} + for _, raftNode := range config.Config.RaftNodes { + peerNode, err := normalizeRaftNode(raftNode) + if err != nil { + return err + } + peerNodes = append(peerNodes, peerNode) + } + if len(peerNodes) == 1 && peerNodes[0] == raftAdvertise { + // To run in single node setup we will either specify an empty RaftNodes, or a single + // raft node that is exactly RaftAdvertise + peerNodes = []string{} + } + if err := store.Open(peerNodes); err != nil { + return log.Errorf("failed to open raft store: %s", err.Error()) + } + + thisLeaderURI, err = computeLeaderURI() + if err != nil { + return FatalRaftError(err) + } + + leaderCh := store.raft.LeaderCh() + go func() { + for isTurnedLeader := range leaderCh { + if isTurnedLeader { + PublishCommand("leader-uri", thisLeaderURI) + } + } + }() + + setupHttpClient() + + atomic.StoreInt64(&raftSetupComplete, 1) + return nil +} + +func isRaftSetupComplete() bool { + return atomic.LoadInt64(&raftSetupComplete) == 1 +} + +// getRaft is a convenience method +func getRaft() *raft.Raft { + return store.raft +} + +func normalizeRaftHostnameIP(host string) (string, error) { + if ip := net.ParseIP(host); ip != nil { + // this is a valid IP address. + return host, nil + } + ips, err := net.LookupIP(host) + if err != nil { + // resolve failed. But we don't want to fail the entire operation for that + log.Errore(err) + return host, nil + } + // resolve success! + for _, ip := range ips { + return ip.String(), nil + } + return host, fmt.Errorf("%+v resolved but no IP found", host) +} + +// normalizeRaftNode attempts to make sure there's a port to the given node. +// It consults the DefaultRaftPort when there isn't +func normalizeRaftNode(node string) (string, error) { + hostPort := strings.Split(node, ":") + host, err := normalizeRaftHostnameIP(hostPort[0]) + if err != nil { + return host, err + } + if len(hostPort) > 1 { + return fmt.Sprintf("%s:%s", host, hostPort[1]), nil + } else if config.Config.DefaultRaftPort != 0 { + // No port specified, add one + return fmt.Sprintf("%s:%d", host, config.Config.DefaultRaftPort), nil + } else { + return host, nil + } +} + +// IsPartOfQuorum returns `true` when this node is part of the raft quorum, meaning its +// data and opinion are trustworthy. +// Comapre that to a node which has left (or has not yet joined) the quorum: it has stale data. 
+func IsPartOfQuorum() bool { + if GetLeader() == "" { + return false + } + state := GetState() + return state == raft.Leader || state == raft.Follower +} + +// IsLeader tells if this node is the current raft leader +func IsLeader() bool { + return GetState() == raft.Leader +} + +// GetLeader returns identity of raft leader +func GetLeader() string { + if !isRaftSetupComplete() { + return "" + } + return getRaft().Leader() +} + +func QuorumSize() (int, error) { + peers, err := GetPeers() + if err != nil { + return 0, err + } + return len(peers)/2 + 1, nil +} + +// GetState returns current raft state +func GetState() raft.RaftState { + if !isRaftSetupComplete() { + return raft.Candidate + } + return getRaft().State() +} + +// IsHealthy checks whether this node is healthy in the raft group +func IsHealthy() bool { + if !isRaftSetupComplete() { + return false + } + state := GetState() + return state == raft.Leader || state == raft.Follower +} + +func Snapshot() error { + future := getRaft().Snapshot() + return future.Error() +} + +func AsyncSnapshot() error { + asyncDuration := (time.Duration(rand.Int63()) % asyncSnapshotTimeframe) + go time.AfterFunc(asyncDuration, func() { + Snapshot() + }) + return nil +} + +func StepDown() { + getRaft().StepDown() +} + +func Yield() error { + if !IsRaftEnabled() { + return RaftNotRunning + } + return getRaft().Yield() +} + +func GetRaftBind() string { + return store.raftBind +} + +func GetRaftAdvertise() string { + return store.raftAdvertise +} + +func GetPeers() ([]string, error) { + if !IsRaftEnabled() { + return []string{}, RaftNotRunning + } + return store.peerStore.Peers() +} + +func IsPeer(peer string) (bool, error) { + if !IsRaftEnabled() { + return false, RaftNotRunning + } + return (store.raftBind == peer), nil +} + +// PublishCommand will distribute a command across the group +func PublishCommand(op string, value interface{}) (response interface{}, err error) { + if !IsRaftEnabled() { + return nil, RaftNotRunning + } + b, err := json.Marshal(value) + if err != nil { + return nil, err + } + return store.genericCommand(op, b) +} + +func AddPeer(addr string) (response interface{}, err error) { + addr, err = normalizeRaftNode(addr) + if err != nil { + return "", err + } + err = store.AddPeer(addr) + return addr, err +} + +func RemovePeer(addr string) (response interface{}, err error) { + addr, err = normalizeRaftNode(addr) + if err != nil { + return "", err + } + err = store.RemovePeer(addr) + return addr, err +} + +func PublishYield(toPeer string) (response interface{}, err error) { + toPeer, err = normalizeRaftNode(toPeer) + if err != nil { + return "", err + } + return store.genericCommand(YieldCommand, []byte(toPeer)) +} + +func PublishYieldHostnameHint(hostnameHint string) (response interface{}, err error) { + return store.genericCommand(YieldHintCommand, []byte(hostnameHint)) +} + +// ReportToRaftLeader tells the leader this raft node is raft-healthy +func ReportToRaftLeader(authenticationToken string) (err error) { + if err := healthRequestReportCache.Add(config.Config.RaftBind, true, cache.DefaultExpiration); err != nil { + // Recently reported + return nil + } + path := fmt.Sprintf("raft-follower-health-report/%s/%s/%s", authenticationToken, config.Config.RaftBind, config.Config.RaftAdvertise) + _, err = HttpGetLeader(path) + return err +} + +// OnHealthReport acts on a raft-member reporting its health +func OnHealthReport(authenticationToken, raftBind, raftAdvertise string) (err error) { + if _, found := 
healthRequestAuthenticationTokenCache.Get(authenticationToken); !found { + return log.Errorf("Raft health report: unknown token %s", authenticationToken) + } + healthReportsCache.Set(raftAdvertise, true, cache.DefaultExpiration) + return nil +} + +func HealthyMembers() (advertised []string) { + items := healthReportsCache.Items() + for raftAdvertised := range items { + advertised = append(advertised, raftAdvertised) + } + return advertised +} + +// Monitor is a utility function to routinely observe leadership state. +// It doesn't actually do much; merely takes notes. +func Monitor() { + t := time.Tick(5 * time.Second) + heartbeat := time.Tick(1 * time.Minute) + followerHealthTick := time.Tick(config.RaftHealthPollSeconds * time.Second) + for { + select { + case <-t: + leaderHint := GetLeader() + + if IsLeader() { + leaderHint = fmt.Sprintf("%s (this host)", leaderHint) + } + log.Debugf("raft leader is %s; state: %s", leaderHint, GetState().String()) + + case <-heartbeat: + if IsLeader() { + go PublishCommand("heartbeat", "") + } + case <-followerHealthTick: + if IsLeader() { + athenticationToken := util.NewToken().Short() + healthRequestAuthenticationTokenCache.Set(athenticationToken, true, cache.DefaultExpiration) + go PublishCommand("request-health-report", athenticationToken) + } + case err := <-fatalRaftErrorChan: + log.Fatale(err) + } + } +} diff --git a/go/vt/orchestrator/raft/rel_store.go b/go/vt/orchestrator/raft/rel_store.go new file mode 100644 index 0000000000..0be7e27f1f --- /dev/null +++ b/go/vt/orchestrator/raft/rel_store.go @@ -0,0 +1,249 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package orcraft + +import ( + "database/sql" + "encoding/binary" + "path/filepath" + "sync" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + "vitess.io/vitess/go/vt/orchestrator/external/golib/sqlutils" + + "vitess.io/vitess/go/vt/orchestrator/external/raft" +) + +const raftStoreFile = "raft_store.db" + +var createQueries = []string{ + ` + CREATE TABLE IF NOT EXISTS raft_log ( + log_index integer, + term bigint not null, + log_type int not null, + data blob not null, + PRIMARY KEY (log_index) + ) + `, + ` + CREATE TABLE IF NOT EXISTS raft_store ( + store_id integer, + store_key varbinary(512) not null, + store_value blob not null, + PRIMARY KEY (store_id) + ) + `, + ` + CREATE INDEX IF NOT EXISTS store_key_idx_raft_store ON raft_store (store_key) + `, +} + +var dbMutex sync.Mutex + +// RelationalStoreimplements: +// - hashicorp/raft.StableStore +// - hashicorp/log.LogStore +type RelationalStore struct { + dataDir string + backend *sql.DB +} + +func NewRelationalStore(dataDir string) *RelationalStore { + return &RelationalStore{ + dataDir: dataDir, + } +} + +func (relStore *RelationalStore) openDB() (*sql.DB, error) { + dbMutex.Lock() + defer dbMutex.Unlock() + + if relStore.backend == nil { + relStoreFile := filepath.Join(relStore.dataDir, raftStoreFile) + sqliteDB, _, err := sqlutils.GetSQLiteDB(relStoreFile) + if err != nil { + return nil, err + } + sqliteDB.SetMaxOpenConns(1) + sqliteDB.SetMaxIdleConns(1) + for _, query := range createQueries { + if _, err := sqliteDB.Exec(sqlutils.ToSqlite3Dialect(query)); err != nil { + return nil, err + } + } + relStore.backend = sqliteDB + log.Infof("raft: store initialized at %+v", relStoreFile) + } + return relStore.backend, nil +} + +func (relStore *RelationalStore) Set(key []byte, val []byte) error { + db, err := relStore.openDB() + if err != nil { + return err + } + tx, err := db.Begin() + if err != nil { + return err + } + stmt, err := tx.Prepare(`delete from raft_store where store_key = ?`) + if err != nil { + return err + } + _, err = stmt.Exec(key) + if err != nil { + tx.Rollback() + return err + } + stmt, err = tx.Prepare(`insert into raft_store (store_key, store_value) values (?, ?)`) + if err != nil { + tx.Rollback() + return err + } + _, err = stmt.Exec(key, val) + if err != nil { + tx.Rollback() + return err + } + err = tx.Commit() + + return err +} + +// Get returns the value for key, or an empty byte slice if key was not found. +func (relStore *RelationalStore) Get(key []byte) (val []byte, err error) { + db, err := relStore.openDB() + if err != nil { + return val, err + } + err = db.QueryRow("select min(store_value) from raft_store where store_key = ?", key).Scan(&val) + return val, err +} + +func (relStore *RelationalStore) SetUint64(key []byte, val uint64) error { + b := make([]byte, 8) + binary.LittleEndian.PutUint64(b, val) + + return relStore.Set(key, b) +} + +// GetUint64 returns the uint64 value for key, or 0 if key was not found. +func (relStore *RelationalStore) GetUint64(key []byte) (uint64, error) { + b, err := relStore.Get(key) + if err != nil { + return 0, err + } + if len(b) == 0 { + // Not found + return 0, nil + } + i := binary.LittleEndian.Uint64(b) + return i, nil +} + +func (relStore *RelationalStore) FirstIndex() (idx uint64, err error) { + db, err := relStore.openDB() + if err != nil { + return idx, err + } + err = db.QueryRow("select ifnull(min(log_index), 0) from raft_log").Scan(&idx) + return idx, err +} + +// LastIndex returns the last index written. 0 for no entries. 
+func (relStore *RelationalStore) LastIndex() (idx uint64, err error) { + db, err := relStore.openDB() + if err != nil { + return idx, err + } + err = db.QueryRow("select ifnull(max(log_index), 0) from raft_log").Scan(&idx) + return idx, err +} + +// GetLog gets a log entry at a given index. +func (relStore *RelationalStore) GetLog(index uint64, log *raft.Log) error { + db, err := relStore.openDB() + if err != nil { + return err + } + err = db.QueryRow(` + select log_index, term, log_type, data + from raft_log + where log_index = ? + `, index).Scan(&log.Index, &log.Term, &log.Type, &log.Data) + if err == sql.ErrNoRows { + return raft.ErrLogNotFound + } + return err +} + +// StoreLog stores a log entry. +func (relStore *RelationalStore) StoreLog(log *raft.Log) error { + return relStore.StoreLogs([]*raft.Log{log}) +} + +// StoreLogs stores multiple log entries. +func (relStore *RelationalStore) StoreLogs(logs []*raft.Log) error { + db, err := relStore.openDB() + if err != nil { + return err + } + tx, err := db.Begin() + if err != nil { + return err + } + stmt, err := tx.Prepare(` + replace into raft_log ( + log_index, term, log_type, data + ) values ( + ?, ?, ?, ? + )`) + if err != nil { + return err + } + for _, raftLog := range logs { + _, err = stmt.Exec(raftLog.Index, raftLog.Term, int(raftLog.Type), raftLog.Data) + if err != nil { + tx.Rollback() + return err + } + } + return tx.Commit() +} + +// DeleteRange deletes a range of log entries. The range is inclusive. +func (relStore *RelationalStore) DeleteRange(min, max uint64) error { + db, err := relStore.openDB() + if err != nil { + return err + } + _, err = db.Exec("delete from raft_log where log_index >= ? and log_index <= ?", min, max) + return err +} + +func (relStore *RelationalStore) DeleteAll() error { + firstIndex, err := relStore.FirstIndex() + if err != nil { + return err + } + lastIndex, err := relStore.LastIndex() + if err != nil { + return err + } + return relStore.DeleteRange(firstIndex, lastIndex) +} diff --git a/go/vt/orchestrator/raft/snapshot.go b/go/vt/orchestrator/raft/snapshot.go new file mode 100644 index 0000000000..62a87fc467 --- /dev/null +++ b/go/vt/orchestrator/raft/snapshot.go @@ -0,0 +1,26 @@ +/* + Copyright 2017 Shlomi Noach, GitHub Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +package orcraft + +import ( + "io" +) + +type SnapshotCreatorApplier interface { + GetData() (data []byte, err error) + Restore(rc io.ReadCloser) error +} diff --git a/go/vt/orchestrator/raft/store.go b/go/vt/orchestrator/raft/store.go new file mode 100644 index 0000000000..5a8c1ca438 --- /dev/null +++ b/go/vt/orchestrator/raft/store.go @@ -0,0 +1,168 @@ +package orcraft + +import ( + "encoding/json" + "fmt" + "net" + "os" + "strings" + "time" + + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" + + "vitess.io/vitess/go/vt/orchestrator/external/raft" +) + +type Store struct { + raftDir string + raftBind string + raftAdvertise string + + raft *raft.Raft // The consensus mechanism + peerStore raft.PeerStore + + applier CommandApplier + snapshotCreatorApplier SnapshotCreatorApplier +} + +type storeCommand struct { + Op string `json:"op,omitempty"` + Value []byte `json:"value,omitempty"` +} + +// NewStore inits and returns a new store +func NewStore(raftDir string, raftBind string, raftAdvertise string, applier CommandApplier, snapshotCreatorApplier SnapshotCreatorApplier) *Store { + return &Store{ + raftDir: raftDir, + raftBind: raftBind, + raftAdvertise: raftAdvertise, + applier: applier, + snapshotCreatorApplier: snapshotCreatorApplier, + } +} + +// Open opens the store. If enableSingle is set, and there are no existing peers, +// then this node becomes the first node, and therefore leader, of the cluster. +func (store *Store) Open(peerNodes []string) error { + // Setup Raft configuration. + config := raft.DefaultConfig() + config.SnapshotThreshold = 1 + config.SnapshotInterval = snapshotInterval + config.ShutdownOnRemove = false + + // Setup Raft communication. + advertise, err := net.ResolveTCPAddr("tcp", store.raftAdvertise) + if err != nil { + return err + } + log.Debugf("raft: advertise=%+v", advertise) + + transport, err := raft.NewTCPTransport(store.raftBind, advertise, 3, 10*time.Second, os.Stderr) + if err != nil { + return err + } + log.Debugf("raft: transport=%+v", transport) + + peers := make([]string, 0, 10) + for _, peerNode := range peerNodes { + peerNode = strings.TrimSpace(peerNode) + peers = raft.AddUniquePeer(peers, peerNode) + } + log.Debugf("raft: peers=%+v", peers) + + // Create peer storage. + peerStore := &raft.StaticPeers{} + if err := peerStore.SetPeers(peers); err != nil { + return err + } + + // Allow the node to enter single-mode, potentially electing itself, if + // explicitly enabled and there is only 1 node in the cluster already. + if len(peerNodes) == 0 && len(peers) <= 1 { + log.Infof("enabling single-node mode") + config.EnableSingleNode = true + config.DisableBootstrapAfterElect = false + } + + if _, err := os.Stat(store.raftDir); err != nil { + if os.IsNotExist(err) { + // path does not exist + log.Debugf("raft: creating data dir %s", store.raftDir) + if err := os.MkdirAll(store.raftDir, os.ModePerm); err != nil { + return log.Errorf("RaftDataDir (%s) does not exist and cannot be created: %+v", store.raftDir, err) + } + } else { + // Other error + return log.Errorf("RaftDataDir (%s) error: %+v", store.raftDir, err) + } + } + + // Create the snapshot store. This allows the Raft to truncate the log. + snapshots, err := NewFileSnapshotStore(store.raftDir, retainSnapshotCount, os.Stderr) + if err != nil { + return log.Errorf("file snapshot store: %s", err) + } + + // Create the log store and stable store. + logStore := NewRelationalStore(store.raftDir) + log.Debugf("raft: logStore=%+v", logStore) + + // Instantiate the Raft systems. 
+ if store.raft, err = raft.NewRaft(config, (*fsm)(store), logStore, logStore, snapshots, peerStore, transport); err != nil { + return fmt.Errorf("error creating new raft: %s", err) + } + store.peerStore = peerStore + log.Infof("new raft created") + + return nil +} + +// AddPeer adds a node, located at addr, to this store. The node must be ready to +// respond to Raft communications at that address. +func (store *Store) AddPeer(addr string) error { + log.Infof("received join request for remote node %s", addr) + + f := store.raft.AddPeer(addr) + if f.Error() != nil { + return f.Error() + } + log.Infof("node at %s joined successfully", addr) + return nil +} + +// RemovePeer removes a node from this raft setup +func (store *Store) RemovePeer(addr string) error { + log.Infof("received remove request for remote node %s", addr) + + f := store.raft.RemovePeer(addr) + if f.Error() != nil { + return f.Error() + } + log.Infof("node at %s removed successfully", addr) + return nil +} + +// genericCommand requests consensus for applying a single command. +// This is an internal orchestrator implementation +func (store *Store) genericCommand(op string, bytes []byte) (response interface{}, err error) { + if store.raft.State() != raft.Leader { + return nil, fmt.Errorf("not leader") + } + + b, err := json.Marshal(&storeCommand{Op: op, Value: bytes}) + if err != nil { + return nil, err + } + + f := store.raft.Apply(b, raftTimeout) + if err = f.Error(); err != nil { + return nil, err + } + r := f.Response() + if err, ok := r.(error); ok && err != nil { + // This code checks whether the response itself was an error object. If so, it should + // indicate failure of the operation. + return r, err + } + return r, nil +} diff --git a/go/vt/orchestrator/ssl/ssl.go b/go/vt/orchestrator/ssl/ssl.go new file mode 100644 index 0000000000..96a0a06c08 --- /dev/null +++ b/go/vt/orchestrator/ssl/ssl.go @@ -0,0 +1,220 @@ +package ssl + +import ( + "crypto/tls" + "crypto/x509" + "encoding/pem" + "errors" + "fmt" + "io/ioutil" + nethttp "net/http" + "strings" + + "github.com/go-martini/martini" + "github.com/howeyc/gopass" + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/external/golib/log" +) + +var cipherSuites = []uint16{ + tls.TLS_ECDHE_RSA_WITH_AES_128_GCM_SHA256, + tls.TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256, + tls.TLS_ECDHE_RSA_WITH_AES_256_GCM_SHA384, + tls.TLS_ECDHE_ECDSA_WITH_AES_256_GCM_SHA384, + tls.TLS_ECDHE_RSA_WITH_AES_128_CBC_SHA, + tls.TLS_ECDHE_ECDSA_WITH_AES_128_CBC_SHA, + tls.TLS_ECDHE_RSA_WITH_AES_256_CBC_SHA, + tls.TLS_ECDHE_ECDSA_WITH_AES_256_CBC_SHA, + tls.TLS_RSA_WITH_AES_128_CBC_SHA, + tls.TLS_RSA_WITH_AES_256_CBC_SHA, +} + +// Determine if a string element is in a string array +func HasString(elem string, arr []string) bool { + for _, s := range arr { + if s == elem { + return true + } + } + return false +} + +// NewTLSConfig returns an initialized TLS configuration suitable for client +// authentication. If caFile is non-empty, it will be loaded. +func NewTLSConfig(caFile string, verifyCert bool) (*tls.Config, error) { + var c tls.Config + + // Set to TLS 1.2 as a minimum. 
This is overridden for mysql communication + c.MinVersion = tls.VersionTLS12 + // Remove insecure ciphers from the list + c.CipherSuites = cipherSuites + c.PreferServerCipherSuites = true + + if verifyCert { + log.Info("verifyCert requested, client certificates will be verified") + c.ClientAuth = tls.VerifyClientCertIfGiven + } + caPool, err := ReadCAFile(caFile) + if err != nil { + return &c, err + } + c.ClientCAs = caPool + c.BuildNameToCertificate() + return &c, nil +} + +// Returns CA certificate. If caFile is non-empty, it will be loaded. +func ReadCAFile(caFile string) (*x509.CertPool, error) { + var caCertPool *x509.CertPool + if caFile != "" { + data, err := ioutil.ReadFile(caFile) + if err != nil { + return nil, err + } + caCertPool = x509.NewCertPool() + if !caCertPool.AppendCertsFromPEM(data) { + return nil, errors.New("No certificates parsed") + } + log.Info("Read in CA file:", caFile) + } + return caCertPool, nil +} + +// Verify that the OU of the presented client certificate matches the list +// of Valid OUs +func Verify(r *nethttp.Request, validOUs []string) error { + if strings.Contains(r.URL.String(), config.Config.StatusEndpoint) && !config.Config.StatusOUVerify { + return nil + } + if r.TLS == nil { + return errors.New("No TLS") + } + for _, chain := range r.TLS.VerifiedChains { + s := chain[0].Subject.OrganizationalUnit + log.Debug("All OUs:", strings.Join(s, " ")) + for _, ou := range s { + log.Debug("Client presented OU:", ou) + if HasString(ou, validOUs) { + log.Debug("Found valid OU:", ou) + return nil + } + } + } + log.Error("No valid OUs found") + return errors.New("Invalid OU") +} + +// TODO: make this testable? +func VerifyOUs(validOUs []string) martini.Handler { + return func(res nethttp.ResponseWriter, req *nethttp.Request, c martini.Context) { + log.Debug("Verifying client OU") + if err := Verify(req, validOUs); err != nil { + nethttp.Error(res, err.Error(), nethttp.StatusUnauthorized) + } + } +} + +// AppendKeyPair loads the given TLS key pair and appends it to +// tlsConfig.Certificates. +func AppendKeyPair(tlsConfig *tls.Config, certFile string, keyFile string) error { + cert, err := tls.LoadX509KeyPair(certFile, keyFile) + if err != nil { + return err + } + tlsConfig.Certificates = append(tlsConfig.Certificates, cert) + return nil +} + +// Read in a keypair where the key is password protected +func AppendKeyPairWithPassword(tlsConfig *tls.Config, certFile string, keyFile string, pemPass []byte) error { + + // Certificates aren't usually password protected, but we're kicking the password + // along just in case. It won't be used if the file isn't encrypted + certData, err := ReadPEMData(certFile, pemPass) + if err != nil { + return err + } + keyData, err := ReadPEMData(keyFile, pemPass) + if err != nil { + return err + } + cert, err := tls.X509KeyPair(certData, keyData) + if err != nil { + return err + } + tlsConfig.Certificates = append(tlsConfig.Certificates, cert) + return nil +} + +// Read a PEM file and ask for a password to decrypt it if needed +func ReadPEMData(pemFile string, pemPass []byte) ([]byte, error) { + pemData, err := ioutil.ReadFile(pemFile) + if err != nil { + return pemData, err + } + + // We should really just get the pem.Block back here, if there's other + // junk on the end, warn about it. 
+ pemBlock, rest := pem.Decode(pemData) + if len(rest) > 0 { + log.Warning("Didn't parse all of", pemFile) + } + + if x509.IsEncryptedPEMBlock(pemBlock) { + // Decrypt and get the ASN.1 DER bytes here + pemData, err = x509.DecryptPEMBlock(pemBlock, pemPass) + if err != nil { + return pemData, err + } else { + log.Info("Decrypted", pemFile, "successfully") + } + // Shove the decrypted DER bytes into a new pem Block with blank headers + var newBlock pem.Block + newBlock.Type = pemBlock.Type + newBlock.Bytes = pemData + // This is now like reading in an uncrypted key from a file and stuffing it + // into a byte stream + pemData = pem.EncodeToMemory(&newBlock) + } + return pemData, nil +} + +// Print a password prompt on the terminal and collect a password +func GetPEMPassword(pemFile string) []byte { + fmt.Printf("Password for %s: ", pemFile) + pass, err := gopass.GetPasswd() + if err != nil { + // We'll error with an incorrect password at DecryptPEMBlock + return []byte("") + } + return pass +} + +// Determine if PEM file is encrypted +func IsEncryptedPEM(pemFile string) bool { + pemData, err := ioutil.ReadFile(pemFile) + if err != nil { + return false + } + pemBlock, _ := pem.Decode(pemData) + if len(pemBlock.Bytes) == 0 { + return false + } + return x509.IsEncryptedPEMBlock(pemBlock) +} + +// ListenAndServeTLS acts identically to http.ListenAndServeTLS, except that it +// expects TLS configuration. +// TODO: refactor so this is testable? +func ListenAndServeTLS(addr string, handler nethttp.Handler, tlsConfig *tls.Config) error { + if addr == "" { + // On unix Listen calls getaddrinfo to parse the port, so named ports are fine as long + // as they exist in /etc/services + addr = ":https" + } + l, err := tls.Listen("tcp", addr, tlsConfig) + if err != nil { + return err + } + return nethttp.Serve(l, handler) +} diff --git a/go/vt/orchestrator/ssl/ssl_test.go b/go/vt/orchestrator/ssl/ssl_test.go new file mode 100644 index 0000000000..2d30fc3729 --- /dev/null +++ b/go/vt/orchestrator/ssl/ssl_test.go @@ -0,0 +1,275 @@ +package ssl_test + +import ( + "crypto/tls" + "crypto/x509" + "encoding/pem" + "fmt" + "io/ioutil" + nethttp "net/http" + "reflect" + "strings" + "syscall" + "testing" + + "vitess.io/vitess/go/vt/orchestrator/config" + "vitess.io/vitess/go/vt/orchestrator/ssl" +) + +func TestHasString(t *testing.T) { + elem := "foo" + a1 := []string{"bar", "foo", "baz"} + a2 := []string{"bar", "fuu", "baz"} + good := ssl.HasString(elem, a1) + if !good { + t.Errorf("Didn't find %s in array %s", elem, strings.Join(a1, ", ")) + } + bad := ssl.HasString(elem, a2) + if bad { + t.Errorf("Unexpectedly found %s in array %s", elem, strings.Join(a2, ", ")) + } +} + +// TODO: Build a fake CA and make sure it loads up +func TestNewTLSConfig(t *testing.T) { + fakeCA := writeFakeFile(pemCertificate) + defer syscall.Unlink(fakeCA) + + conf, err := ssl.NewTLSConfig(fakeCA, true) + if err != nil { + t.Errorf("Could not create new TLS config: %s", err) + } + if conf.ClientAuth != tls.VerifyClientCertIfGiven { + t.Errorf("Client certificate verification was not enabled") + } + if conf.ClientCAs == nil { + t.Errorf("ClientCA empty even though cert provided") + } + + conf, err = ssl.NewTLSConfig("", false) + if err != nil { + t.Errorf("Could not create new TLS config: %s", err) + } + if conf.ClientAuth == tls.VerifyClientCertIfGiven { + t.Errorf("Client certificate verification was enabled unexpectedly") + } + if conf.ClientCAs != nil { + t.Errorf("Filling in ClientCA somehow without a cert") + } +} + +func 
TestStatus(t *testing.T) { + var validOUs []string + url := fmt.Sprintf("http://example.com%s", config.Config.StatusEndpoint) + + req, err := nethttp.NewRequest("GET", url, nil) + if err != nil { + t.Fatal(err) + } + config.Config.StatusOUVerify = false + if err := ssl.Verify(req, validOUs); err != nil { + t.Errorf("Failed even with verification off") + } + config.Config.StatusOUVerify = true + if err := ssl.Verify(req, validOUs); err == nil { + t.Errorf("Did not fail on with bad verification") + } +} + +func TestVerify(t *testing.T) { + var validOUs []string + + req, err := nethttp.NewRequest("GET", "http://example.com/foo", nil) + if err != nil { + t.Fatal(err) + } + + if err := ssl.Verify(req, validOUs); err == nil { + t.Errorf("Did not fail on lack of TLS config") + } + + pemBlock, _ := pem.Decode([]byte(pemCertificate)) + cert, err := x509.ParseCertificate(pemBlock.Bytes) + if err != nil { + t.Fatal(err) + } + + var tcs tls.ConnectionState + req.TLS = &tcs + + if err := ssl.Verify(req, validOUs); err == nil { + t.Errorf("Found a valid OU without any being available") + } + + // Set a fake OU + cert.Subject.OrganizationalUnit = []string{"testing"} + + // Pretend our request had a certificate + req.TLS.PeerCertificates = []*x509.Certificate{cert} + req.TLS.VerifiedChains = [][]*x509.Certificate{req.TLS.PeerCertificates} + + // Look for fake OU + validOUs = []string{"testing"} + + if err := ssl.Verify(req, validOUs); err != nil { + t.Errorf("Failed to verify certificate OU") + } +} + +func TestReadPEMData(t *testing.T) { + pemCertFile := writeFakeFile(pemCertificate) + defer syscall.Unlink(pemCertFile) + pemPKFile := writeFakeFile(pemPrivateKey) + defer syscall.Unlink(pemPKFile) + pemPKWPFile := writeFakeFile(pemPrivateKeyWithPass) + defer syscall.Unlink(pemPKWPFile) + _, err := ssl.ReadPEMData(pemCertFile, []byte{}) + if err != nil { + t.Errorf("Failed to decode certificate: %s", err) + } + pemNoPassBytes, err := ssl.ReadPEMData(pemPKFile, []byte{}) + if err != nil { + t.Errorf("Failed to decode private key: %s", err) + } + pemPassBytes, err := ssl.ReadPEMData(pemPKWPFile, []byte("testing")) + if err != nil { + t.Errorf("Failed to decode private key with password: %s", err) + } + if reflect.DeepEqual(pemPassBytes, pemNoPassBytes) { + t.Errorf("PEM encoding failed after password removal") + } +} + +func TestAppendKeyPair(t *testing.T) { + c, err := ssl.NewTLSConfig("", false) + if err != nil { + t.Fatal(err) + } + pemCertFile := writeFakeFile(pemCertificate) + defer syscall.Unlink(pemCertFile) + pemPKFile := writeFakeFile(pemPrivateKey) + defer syscall.Unlink(pemPKFile) + + if err := ssl.AppendKeyPair(c, pemCertFile, pemPKFile); err != nil { + t.Errorf("Failed to append certificate and key to tls config: %s", err) + } +} + +func TestAppendKeyPairWithPassword(t *testing.T) { + c, err := ssl.NewTLSConfig("", false) + if err != nil { + t.Fatal(err) + } + pemCertFile := writeFakeFile(pemCertificate) + defer syscall.Unlink(pemCertFile) + pemPKFile := writeFakeFile(pemPrivateKeyWithPass) + defer syscall.Unlink(pemPKFile) + + if err := ssl.AppendKeyPairWithPassword(c, pemCertFile, pemPKFile, []byte("testing")); err != nil { + t.Errorf("Failed to append certificate and key to tls config: %s", err) + } +} + +func TestIsEncryptedPEM(t *testing.T) { + pemPKFile := writeFakeFile(pemPrivateKey) + defer syscall.Unlink(pemPKFile) + pemPKWPFile := writeFakeFile(pemPrivateKeyWithPass) + defer syscall.Unlink(pemPKWPFile) + if ssl.IsEncryptedPEM(pemPKFile) { + t.Errorf("Incorrectly identified unencrypted 
PEM as encrypted") + } + if !ssl.IsEncryptedPEM(pemPKWPFile) { + t.Errorf("Incorrectly identified encrypted PEM as unencrypted") + } +} + +func writeFakeFile(content string) string { + f, err := ioutil.TempFile("", "ssl_test") + if err != nil { + return "" + } + ioutil.WriteFile(f.Name(), []byte(content), 0644) + return f.Name() +} + +const pemCertificate = `-----BEGIN CERTIFICATE----- +MIIDtTCCAp2gAwIBAgIJAOxKC7FsJelrMA0GCSqGSIb3DQEBBQUAMEUxCzAJBgNV +BAYTAkFVMRMwEQYDVQQIEwpTb21lLVN0YXRlMSEwHwYDVQQKExhJbnRlcm5ldCBX +aWRnaXRzIFB0eSBMdGQwHhcNMTcwODEwMTQ0MjM3WhcNMTgwODEwMTQ0MjM3WjBF +MQswCQYDVQQGEwJBVTETMBEGA1UECBMKU29tZS1TdGF0ZTEhMB8GA1UEChMYSW50 +ZXJuZXQgV2lkZ2l0cyBQdHkgTHRkMIIBIjANBgkqhkiG9w0BAQEFAAOCAQ8AMIIB +CgKCAQEA12vHV3gYy5zd1lujA7prEhCSkAszE6E37mViWhLQ63CuedZfyYaTAHQK +HYDZi4K1MNAySUfZRMcICSSsxlRIz6mzXrFsowaJgwx4cbMDIvXE03KstuXoTYJh ++xmXB+5yEVEtIyP2DvPqfCmwCZb3k94Y/VY1nAQDxIxciXrAxT9zT1oYd0YWr2yp +J2mgsfnY4c3zg7W5WgvOTmYz7Ey7GJjpUjGdayx+P1CilKzSWH1xZuVQFNLSHvcH +WXkEoCMVc0tW5mO5eEO1aNHo9MSjPF386l1rq+pz5OwjqCEZq2b1YxesyLnbF+8+ +iYGfYmFaDLFwG7zVDwialuI4TzIIOQIDAQABo4GnMIGkMB0GA1UdDgQWBBQ1ubGx +Yvn3wN5VXyoR0lOD7ARzVTB1BgNVHSMEbjBsgBQ1ubGxYvn3wN5VXyoR0lOD7ARz +VaFJpEcwRTELMAkGA1UEBhMCQVUxEzARBgNVBAgTClNvbWUtU3RhdGUxITAfBgNV +BAoTGEludGVybmV0IFdpZGdpdHMgUHR5IEx0ZIIJAOxKC7FsJelrMAwGA1UdEwQF +MAMBAf8wDQYJKoZIhvcNAQEFBQADggEBALmm4Zw/4jLKDJciUGUYOcr5Xe9TP/Cs +afH7IWvaFUDfV3W6yAm9jgNfIy9aDLpuu2CdEb+0qL2hdmGLV7IM3y62Ve0UTdGV +BGsm1zMmIguew2wGbAwGr5LmIcUseatVUKAAAfDrBNwotEAdM8kmGekUZfOM+J9D +FoNQ62C0buRHGugtu6zWAcZNOe6CI7HdhaAdxZlgn8y7dfJQMacoK0NcWeUVQwii +6D4mgaqUGM2O+WcquD1vEMuBPYVcKhi43019E0+6LI5QB6w80bARY8K7tkTdRD7U +y1/C7iIqyuBVL45OdSabb37TfGlHZIPIwLaGw3i4Mr0+F0jQT8rZtTQ= +-----END CERTIFICATE-----` + +const pemPrivateKey = `-----BEGIN RSA PRIVATE KEY----- +MIIEpAIBAAKCAQEA12vHV3gYy5zd1lujA7prEhCSkAszE6E37mViWhLQ63CuedZf +yYaTAHQKHYDZi4K1MNAySUfZRMcICSSsxlRIz6mzXrFsowaJgwx4cbMDIvXE03Ks +tuXoTYJh+xmXB+5yEVEtIyP2DvPqfCmwCZb3k94Y/VY1nAQDxIxciXrAxT9zT1oY +d0YWr2ypJ2mgsfnY4c3zg7W5WgvOTmYz7Ey7GJjpUjGdayx+P1CilKzSWH1xZuVQ +FNLSHvcHWXkEoCMVc0tW5mO5eEO1aNHo9MSjPF386l1rq+pz5OwjqCEZq2b1Yxes +yLnbF+8+iYGfYmFaDLFwG7zVDwialuI4TzIIOQIDAQABAoIBAHLf4pleTbqmmBWr +IC7oxhgIBmAR2Nbq7eyO2/e0ePxURnZqPwI0ZUekmZBKGbgvp3e0TlyNl+r5R+u4 +RvosD/fNQv2IF6qH3eSoTcIz98Q40xD+4eNWjp5mnOFOMB/mo6VgaHWIw7oNkElN +4bX7b2LG2QSfaE8eRPQW9XHKp+mGhYFbxgPYxUmlIXuYZF61hVwxysDA6DP3LOi8 +yUL6E64x6NqN9xtg/VoN+f6N0MOvsr4yb5+uvni1LVRFI7tNqIN4Y6P6trgKfnRR +EpZeAUu8scqyxE4NeqnnjK/wBuXxaeh3e9mN1V2SzT629c1InmmQasZ5slcCJQB+ +38cswgECgYEA+esaLKwHXT4+sOqMYemi7TrhxtNC2f5OAGUiSRVmTnum2gl4wOB+ +h5oLZAuG5nBEIoqbMEbI35vfuHqIe390IJtPdQlz4TGDsPufYj/gnnBBFy/c8f+n +f/CdRDRYrpnpKGwvUntLRB2pFbe2hlqqq+4YUqiHauJMOCJnPbOo1lECgYEA3KnF +VOXyY0fKD45G7ttfAcpw8ZI2gY99sCRwtBQGsbO61bvw5sl/3j7AmYosz+n6f7hb +uHmitIuPv4z3r1yfVysh80tTGIM3wDkpr3fLYRxpVOZU4hgxMQV9yyaSA/Hfqn48 +vIK/NC4bERqpofNNdrIqNaGWkd87ZycvpRfa0WkCgYBztbVVr4RtWG9gLAg5IRot +KhD0pEWUdpiYuDpqifznI3r6Al6lNot+rwTNGkUoFhyFvZTigjNozFuFpz3fqAAV +RLNCJdFAF1O4spd1vst5r9GDMcbjSJG9u6KkvHO+y0XXUFeMoccUT4NEqd1ZUUsp +9T/PrXWdOA9AAjW4rKDkMQKBgQC9R4NVR8mbD8Frhoeh69qbFqO7E8hdalBN/3QN +hAAZ/imNnSEPVliwsvNSwQufbPzLAcDrhKrkY7JyhOERM0oa44zDvSESLbxszpvL +P97c9hoEEW9OYaIQgr1cvUES0S8ieBZxPVX11HazPUO0/5a68ijyyCD4D5xM53gf +DU9NwQKBgQCmVthQi65xcc4mgCIwXtBZWXeaPv5x0dLEXIC5EoN6eXLK9iW//7cE +hhawtJtl+J6laB+TkEGQsyhc4v85WcywdisyR7LR7CUqFYJMKeE/VtTVKnYbfq54 +rHoQS9YotByBwPtRx0V93gkc+KWBOGmSBBxKj7lrBkYkcWAiRfpJjg== +-----END RSA PRIVATE KEY-----` + +const pemPrivateKeyWithPass = `-----BEGIN RSA PRIVATE KEY----- +Proc-Type: 4,ENCRYPTED 
+DEK-Info: DES-EDE3-CBC,3EABF60A784F9065 + +IDGYvdRJXvBt5vEDI9caEYJ2vvVmoqmxTKvheNX0aLSXUl/p8hIZ25kd/4mpmI3m +irQdEe2JuNh4/fPDe6Agg6mX6mYCVbiupfXdFKkqJzndW/O5nEQ4yuRgi0fO4wcH +OM/kTS8/7UaKfCuWFa71ywh1WeStFDBwsMQqLdFFeuQ/JC6g2tZW6xzCBE0BVIkq +6OWXmWumXMufhOdpb9sNoc3lbdOi037V886o0cIRQp4qPepElhhhplrhaJZBSxiP +TUldExbtYCN1APhrgUp1RpxIWHNLezjhUYLGooxb6SqinpLd9ia2uFotwNDeX7/T +dMPQPtgdFwvoCtWn9oVWp+regdZPacABLsvtTD4NS8h13BKzBmAqtYfHJk44u/Tv +6PcCb9xHI7+YpNJznrHiCtALWkfG56mDjp0SP+OKjsYMjo317D+x892i2XT79k2T +0IM0OUPizVkN5c7uDQBHqxmE9JVQT7QFMy1P57nWPsmG5o7e9Y/klaPQzi04FWEh +YAEZrU5/FQlFziu3/Jw6WwQnm3IqJP6iMlnR9Y5iZCZQnLhcJNIxxOJ/+cVH4dVD +jIHztasHgbfld045Ua7nk91VyFP5pWRPFacJ74D+xm/1IjF/+9Uj3NQX88Swig0Q +Fi7+eJ1XtCI0YdUqiUdp8QaS1GnFzibSIcXCbLLEn0Cgh/3CFXUyh92M4GIgvmcI +/hi4nUDa3nLYDHyOZubFLERb+Zr3EFzNXX4Ga3fcNH0deluxW4tda+QCk0ud6k9N +y2bCcAVnvbB+yX2s7CSVq+eaT/4JLIJY5AlrISRwYtG57SR/DN9HuU99dD30k581 +PmarIt4VAakjXo/Zqd1AMh+ofbC/Qm7jBwbPGPZAM/FjpnVsvaXsdChI19Az72v3 +wiLOKEw8M23vV4/E7QwW3Pp/RPyUZk6HAlBuLXbcyZHOOV4WPsKrI46BBXL8Qf4X +5kpRITFFUaFu3aaO7mloVAoneEKusKJgKOAwWifRI3jf6fH9B8qDA0jQpWRNpLs4 +3A2qrOyHQ9SMoBr7ya8Vs2BMdfqAmOyiUdVzLr2EjnRxa7f3/7/sdzD1aaIJa2TM +kjpKgFMq5B/FRVmuAvKyEF52A/b6L9EpinyB53DzWnIw9W5zdjjRkuxmGmv1R94A +gJvbONh955cinHft0rm0hdKo77wDvXZdX5ZeITjOwJ0d/VBHYDGUonDVgnAVLcz+ +n1BS+oOS1xLG/EJOGqtNYihVuCkbIwwdAVhc7pKo3nIbLyrKFKFyh/Br11PPBris +nlWo8BWSoFv7gKOftkulHJFAVekisaXe4OIcYMATeLvDfAnBDJrNHZn0HcyHI51L +3EhCCPJrrmfNv+QMdPk6LTts5YIdhNRSV5PR2X8ZshChod7atyrw+Wm+LCcy3h1G +xIVNracpnna+Ic5M8EIJZgLOH7IjDFS1EcPjz5em0rVqGGsLDvxmRo2ZJTPSHlpM +8q6VJEIso5sfoauf+fX+y7xk1CpFG8NkXSplbiYmZXdB1zepV1a/ZiW2uU7hEAV7 +oMEzoBEIw3wTuRasixjH7Z6i8PvF3eUKXCIt0UiwTmWdCCW37c5eqjguyp9aLDtc +-----END RSA PRIVATE KEY-----` diff --git a/go/vt/orchestrator/util/log_cache.go b/go/vt/orchestrator/util/log_cache.go new file mode 100644 index 0000000000..97f58a8b44 --- /dev/null +++ b/go/vt/orchestrator/util/log_cache.go @@ -0,0 +1,30 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +*/ + +package util + +import ( + "fmt" + "time" + + "github.com/patrickmn/go-cache" +) + +var logEntryCache *cache.Cache = cache.New(time.Minute, time.Second*5) + +func ClearToLog(topic string, key string) bool { + return logEntryCache.Add(fmt.Sprintf("%s:%s", topic, key), true, cache.DefaultExpiration) == nil +} diff --git a/go/vt/orchestrator/util/token.go b/go/vt/orchestrator/util/token.go new file mode 100644 index 0000000000..f31f85673e --- /dev/null +++ b/go/vt/orchestrator/util/token.go @@ -0,0 +1,70 @@ +/* + Copyright 2014 Outbrain Inc. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ See the License for the specific language governing permissions and + limitations under the License. +*/ + +package util + +import ( + "crypto/rand" + "crypto/sha256" + "encoding/hex" + "fmt" + "time" +) + +const ( + shortTokenLength = 8 +) + +func toHash(input []byte) string { + hasher := sha256.New() + hasher.Write(input) + return hex.EncodeToString(hasher.Sum(nil)) +} + +func getRandomData() []byte { + size := 64 + rb := make([]byte, size) + _, _ = rand.Read(rb) + return rb +} + +func RandomHash() string { + return toHash(getRandomData()) +} + +// Token is used to identify and validate requests to this service +type Token struct { + Hash string +} + +func (this *Token) Short() string { + if len(this.Hash) <= shortTokenLength { + return this.Hash + } + return this.Hash[0:shortTokenLength] +} + +var ProcessToken *Token = NewToken() + +func NewToken() *Token { + return &Token{ + Hash: RandomHash(), + } +} + +func PrettyUniqueToken() string { + return fmt.Sprintf("%d:%s", time.Now().UnixNano(), NewToken().Hash) +} diff --git a/go/vt/orchestrator/util/token_test.go b/go/vt/orchestrator/util/token_test.go new file mode 100644 index 0000000000..5dfa1fc084 --- /dev/null +++ b/go/vt/orchestrator/util/token_test.go @@ -0,0 +1,25 @@ +package util + +import ( + "testing" + + test "vitess.io/vitess/go/vt/orchestrator/external/golib/tests" +) + +func init() { +} + +func TestNewToken(t *testing.T) { + token1 := NewToken() + + test.S(t).ExpectNotEquals(token1.Hash, "") + test.S(t).ExpectEquals(len(token1.Hash), 64) +} + +func TestNewTokenRandom(t *testing.T) { + token1 := NewToken() + token2 := NewToken() + + // The following test can fail once in a quadrazillion eons + test.S(t).ExpectNotEquals(token1.Hash, token2.Hash) +}
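
The ssl_test.go hunk above only exercises the orchestrator ssl helpers (NewTLSConfig, AppendKeyPair, AppendKeyPairWithPassword, ReadPEMData, IsEncryptedPEM, Verify) through the test harness. The sketch below shows how those helpers might be wired into a server; it is a hedged illustration, not code from this PR. The import paths are assumed from the PR's file layout, the two NewTLSConfig arguments (a CA file and a mutual-TLS flag) are inferred from the ssl.NewTLSConfig("", false) call in the tests, and the file names and OU allow-list are hypothetical.

package main

import (
	"log"
	"net/http"

	"vitess.io/vitess/go/vt/orchestrator/ssl"
)

func main() {
	// Assumed signature, inferred from the tests above:
	// NewTLSConfig(caFile string, mutualTLS bool) (*tls.Config, error).
	tlsConfig, err := ssl.NewTLSConfig("", true)
	if err != nil {
		log.Fatal(err)
	}

	// Hypothetical file names; AppendKeyPair / AppendKeyPairWithPassword load
	// the certificate and key into tlsConfig. IsEncryptedPEM picks the variant.
	certFile, keyFile := "/path/to/server.pem", "/path/to/server.key"
	if ssl.IsEncryptedPEM(keyFile) {
		err = ssl.AppendKeyPairWithPassword(tlsConfig, certFile, keyFile, []byte("key-password"))
	} else {
		err = ssl.AppendKeyPair(tlsConfig, certFile, keyFile)
	}
	if err != nil {
		log.Fatal(err)
	}

	validOUs := []string{"testing"} // allow-list of client-certificate OUs
	mux := http.NewServeMux()
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// Per TestStatus, the status endpoint bypasses this check when
		// config.Config.StatusOUVerify is false; other requests need a
		// verified client certificate whose OU appears in validOUs.
		if err := ssl.Verify(r, validOUs); err != nil {
			http.Error(w, "forbidden", http.StatusForbidden)
			return
		}
		_, _ = w.Write([]byte("ok"))
	})

	srv := &http.Server{Addr: ":3000", Handler: mux, TLSConfig: tlsConfig}
	// Certificates were already appended to tlsConfig, so no files are passed here.
	log.Fatal(srv.ListenAndServeTLS("", ""))
}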
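
The new log_cache.go acts as a small log rate-limiter: cache.Add only succeeds the first time a topic:key pair is seen within the one-minute expiry window, so ClearToLog returns true at most once per minute for a given pair. A minimal usage sketch, assuming the package is imported at the path this PR creates and using the standard library logger rather than orchestrator's own:

package main

import (
	"log"
	"time"

	"vitess.io/vitess/go/vt/orchestrator/util"
)

func main() {
	for i := 0; i < 5; i++ {
		// Only the first iteration logs: subsequent calls within one minute
		// find "replication:cell-1" already cached and return false.
		if util.ClearToLog("replication", "cell-1") {
			log.Printf("replica in cell-1 is lagging (iteration %d)", i)
		}
		time.Sleep(100 * time.Millisecond)
	}
}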
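
token.go gives the process a random identity: RandomHash SHA-256-hashes 64 random bytes, so Hash is always 64 hex characters (which is exactly what TestNewToken asserts), Short truncates to the first 8 characters, ProcessToken is generated once at package init, and PrettyUniqueToken prefixes a fresh hash with the current UnixNano timestamp. A short sketch of the intended use, again assuming the import path created by this PR:

package main

import (
	"fmt"

	"vitess.io/vitess/go/vt/orchestrator/util"
)

func main() {
	// ProcessToken is created once when the util package is initialized and
	// identifies this process for the lifetime of the run.
	fmt.Println("process token:", util.ProcessToken.Hash)    // 64 hex characters
	fmt.Println("short form:", util.ProcessToken.Short())    // first 8 characters

	// Fresh tokens are independent random hashes (see TestNewTokenRandom).
	t1, t2 := util.NewToken(), util.NewToken()
	fmt.Println("distinct:", t1.Hash != t2.Hash)

	// Timestamped, e.g. "1595520000000000000:ab12...".
	fmt.Println(util.PrettyUniqueToken())
}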