reconcile: separate recorable and unrecorable error
This commit is contained in:
Родитель
537b3dfd20
Коммит
a43038ee2a
|
@ -98,6 +98,11 @@ func (c *Cluster) send(ev *clusterEvent) {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cluster) run() {
|
func (c *Cluster) run() {
|
||||||
|
defer func() {
|
||||||
|
log.Warningf("kiling cluster (%v)", c.name)
|
||||||
|
c.delete()
|
||||||
|
close(c.stopCh)
|
||||||
|
}()
|
||||||
for {
|
for {
|
||||||
select {
|
select {
|
||||||
case event := <-c.eventCh:
|
case event := <-c.eventCh:
|
||||||
|
@ -106,8 +111,6 @@ func (c *Cluster) run() {
|
||||||
log.Printf("update: from: %#v, to: %#v", c.spec, event.spec)
|
log.Printf("update: from: %#v, to: %#v", c.spec, event.spec)
|
||||||
c.spec.Size = event.spec.Size
|
c.spec.Size = event.spec.Size
|
||||||
case eventDeleteCluster:
|
case eventDeleteCluster:
|
||||||
c.delete()
|
|
||||||
close(c.stopCh)
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
case <-time.After(5 * time.Second):
|
case <-time.After(5 * time.Second):
|
||||||
|
@ -124,15 +127,25 @@ func (c *Cluster) run() {
|
||||||
running.Add(&etcdutil.Member{Name: name})
|
running.Add(&etcdutil.Member{Name: name})
|
||||||
}
|
}
|
||||||
if err := c.reconcile(running); err != nil {
|
if err := c.reconcile(running); err != nil {
|
||||||
log.Errorf("fail to reconcile: %v", err)
|
log.Errorf("cluster (%v) fail to reconcile: %v", c.name, err)
|
||||||
if !isErrTransient(err) {
|
if isFatalError(err) {
|
||||||
log.Fatalf("unexpected error from reconciling: %v", err)
|
log.Errorf("cluster (%v) had fatal error: %v", c.name, err)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func isFatalError(err error) bool {
|
||||||
|
switch err {
|
||||||
|
case errNoBackupExist:
|
||||||
|
return true
|
||||||
|
default:
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
func (c *Cluster) makeSeedMember() *etcdutil.Member {
|
func (c *Cluster) makeSeedMember() *etcdutil.Member {
|
||||||
etcdName := fmt.Sprintf("%s-%04d", c.name, c.idCounter)
|
etcdName := fmt.Sprintf("%s-%04d", c.name, c.idCounter)
|
||||||
return &etcdutil.Member{Name: etcdName}
|
return &etcdutil.Member{Name: etcdName}
|
||||||
|
@ -141,7 +154,8 @@ func (c *Cluster) makeSeedMember() *etcdutil.Member {
|
||||||
func (c *Cluster) startSeedMember(recoverFromBackup bool) error {
|
func (c *Cluster) startSeedMember(recoverFromBackup bool) error {
|
||||||
m := c.makeSeedMember()
|
m := c.makeSeedMember()
|
||||||
if err := c.createPodAndService(etcdutil.NewMemberSet(m), m, "new", recoverFromBackup); err != nil {
|
if err := c.createPodAndService(etcdutil.NewMemberSet(m), m, "new", recoverFromBackup); err != nil {
|
||||||
return fmt.Errorf("failed to create seed member (%s): %v", m.Name, err)
|
log.Errorf("failed to create seed member (%s): %v", m.Name, err)
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
c.idCounter++
|
c.idCounter++
|
||||||
log.Infof("created cluster (%s) with seed member (%s)", c.name, m.Name)
|
log.Infof("created cluster (%s) with seed member (%s)", c.name, m.Name)
|
||||||
|
@ -346,8 +360,11 @@ func (c *Cluster) delete() {
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cluster) createPodAndService(members etcdutil.MemberSet, m *etcdutil.Member, state string, needRecovery bool) error {
|
func (c *Cluster) createPodAndService(members etcdutil.MemberSet, m *etcdutil.Member, state string, needRecovery bool) error {
|
||||||
|
// TODO: remove garbage service. Because we will fail after service created before pods created.
|
||||||
if err := k8sutil.CreateEtcdService(c.kclient, m.Name, c.name, c.namespace); err != nil {
|
if err := k8sutil.CreateEtcdService(c.kclient, m.Name, c.name, c.namespace); err != nil {
|
||||||
return err
|
if !k8sutil.IsKubernetesResourceAlreadyExistError(err) {
|
||||||
|
return err
|
||||||
|
}
|
||||||
}
|
}
|
||||||
token := ""
|
token := ""
|
||||||
if state == "new" {
|
if state == "new" {
|
||||||
|
@ -361,13 +378,13 @@ func (c *Cluster) createPodAndService(members etcdutil.MemberSet, m *etcdutil.Me
|
||||||
}
|
}
|
||||||
|
|
||||||
func (c *Cluster) removePodAndService(name string) error {
|
func (c *Cluster) removePodAndService(name string) error {
|
||||||
err := c.kclient.Pods(c.namespace).Delete(name, nil)
|
err := c.kclient.Services(c.namespace).Delete(name)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if !k8sutil.IsKubernetesResourceNotFoundError(err) {
|
if !k8sutil.IsKubernetesResourceNotFoundError(err) {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
err = c.kclient.Services(c.namespace).Delete(name)
|
err = c.kclient.Pods(c.namespace).Delete(name, nil)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if !k8sutil.IsKubernetesResourceNotFoundError(err) {
|
if !k8sutil.IsKubernetesResourceNotFoundError(err) {
|
||||||
return err
|
return err
|
||||||
|
@ -384,12 +401,3 @@ func (c *Cluster) pollPods() ([]string, []string, error) {
|
||||||
ready, unready := k8sutil.SliceReadyAndUnreadyPods(podList)
|
ready, unready := k8sutil.SliceReadyAndUnreadyPods(podList)
|
||||||
return ready, unready, nil
|
return ready, unready, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func isErrTransient(err error) bool {
|
|
||||||
switch err {
|
|
||||||
case errTimeoutAddMember, errTimeoutRemoveMember:
|
|
||||||
return true
|
|
||||||
default:
|
|
||||||
return false
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
|
@ -4,21 +4,17 @@ import (
|
||||||
"errors"
|
"errors"
|
||||||
"fmt"
|
"fmt"
|
||||||
"net/http"
|
"net/http"
|
||||||
"time"
|
|
||||||
|
|
||||||
log "github.com/Sirupsen/logrus"
|
log "github.com/Sirupsen/logrus"
|
||||||
"github.com/coreos/etcd/clientv3"
|
"github.com/coreos/etcd/clientv3"
|
||||||
"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
|
|
||||||
"github.com/coreos/kube-etcd-controller/pkg/util/constants"
|
"github.com/coreos/kube-etcd-controller/pkg/util/constants"
|
||||||
"github.com/coreos/kube-etcd-controller/pkg/util/etcdutil"
|
"github.com/coreos/kube-etcd-controller/pkg/util/etcdutil"
|
||||||
"github.com/coreos/kube-etcd-controller/pkg/util/k8sutil"
|
"github.com/coreos/kube-etcd-controller/pkg/util/k8sutil"
|
||||||
"golang.org/x/net/context"
|
"golang.org/x/net/context"
|
||||||
"k8s.io/kubernetes/pkg/util/wait"
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
errTimeoutAddMember = errors.New("timeout to add etcd member")
|
errNoBackupExist = errors.New("No backup exist for a disaster recovery")
|
||||||
errTimeoutRemoveMember = errors.New("timeout to remove etcd member")
|
|
||||||
)
|
)
|
||||||
|
|
||||||
// reconcile reconciles
|
// reconcile reconciles
|
||||||
|
@ -110,37 +106,24 @@ func (c *Cluster) addOneMember() error {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
defer etcdcli.Close()
|
defer etcdcli.Close()
|
||||||
|
|
||||||
newMemberName := fmt.Sprintf("%s-%04d", c.name, c.idCounter)
|
newMemberName := fmt.Sprintf("%s-%04d", c.name, c.idCounter)
|
||||||
newMember := &etcdutil.Member{Name: newMemberName}
|
newMember := &etcdutil.Member{Name: newMemberName}
|
||||||
var id uint64
|
ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
|
||||||
// Could have "unhealthy cluster" due to 5 second strict check. Retry.
|
resp, err := etcdcli.MemberAdd(ctx, []string{newMember.PeerAddr()})
|
||||||
err = wait.Poll(2*time.Second, 20*time.Second, func() (done bool, err error) {
|
|
||||||
ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
|
|
||||||
resp, err := etcdcli.MemberAdd(ctx, []string{newMember.PeerAddr()})
|
|
||||||
if err != nil {
|
|
||||||
if err == rpctypes.ErrUnhealthy || err == context.DeadlineExceeded {
|
|
||||||
return false, nil
|
|
||||||
}
|
|
||||||
return false, fmt.Errorf("etcdcli failed to add one member: %v", err)
|
|
||||||
}
|
|
||||||
id = resp.Member.ID
|
|
||||||
return true, nil
|
|
||||||
})
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
if err == wait.ErrWaitTimeout {
|
|
||||||
err = errTimeoutAddMember
|
|
||||||
}
|
|
||||||
log.Errorf("fail to add new member (%s): %v", newMember.Name, err)
|
log.Errorf("fail to add new member (%s): %v", newMember.Name, err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
newMember.ID = id
|
newMember.ID = resp.Member.ID
|
||||||
c.members.Add(newMember)
|
c.members.Add(newMember)
|
||||||
|
|
||||||
if err := c.createPodAndService(c.members, newMember, "existing", false); err != nil {
|
if err := c.createPodAndService(c.members, newMember, "existing", false); err != nil {
|
||||||
|
log.Errorf("fail to create member (%s): %v", newMember.Name, err)
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
c.idCounter++
|
c.idCounter++
|
||||||
log.Printf("added member, cluster: %s", c.members.PeerURLPairs())
|
log.Printf("added member (%s), cluster (%s)", newMember.Name, c.members.PeerURLPairs())
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -161,10 +144,8 @@ func (c *Cluster) removeMember(toRemove *etcdutil.Member) error {
|
||||||
|
|
||||||
ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
|
ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
|
||||||
if _, err := etcdcli.Cluster.MemberRemove(ctx, toRemove.ID); err != nil {
|
if _, err := etcdcli.Cluster.MemberRemove(ctx, toRemove.ID); err != nil {
|
||||||
if err == rpctypes.ErrUnhealthy || err == context.DeadlineExceeded {
|
log.Errorf("etcdcli failed to remove one member: %v", err)
|
||||||
return errTimeoutRemoveMember
|
return err
|
||||||
}
|
|
||||||
return fmt.Errorf("etcdcli failed to remove one member: %v", err)
|
|
||||||
}
|
}
|
||||||
c.members.Remove(toRemove.Name)
|
c.members.Remove(toRemove.Name)
|
||||||
if err := c.removePodAndService(toRemove.Name); err != nil {
|
if err := c.removePodAndService(toRemove.Name); err != nil {
|
||||||
|
@ -176,9 +157,11 @@ func (c *Cluster) removeMember(toRemove *etcdutil.Member) error {
|
||||||
|
|
||||||
func (c *Cluster) disasterRecovery(left etcdutil.MemberSet) error {
|
func (c *Cluster) disasterRecovery(left etcdutil.MemberSet) error {
|
||||||
if c.spec.Backup == nil {
|
if c.spec.Backup == nil {
|
||||||
return fmt.Errorf("fail to do disaster recovery for cluster (%s): no backup policy has been defined."+
|
log.Errorf("fail to do disaster recovery for cluster (%s): no backup policy has been defined.", c.name)
|
||||||
" (TODO: Mark cluster as dead)", c.name)
|
return errNoBackupExist
|
||||||
}
|
}
|
||||||
|
// TODO: We shouldn't return error in backupnow. If backupnow failed, we should ask if it has any backup before.
|
||||||
|
// If so, we can still continue. Otherwise, it's fatal error.
|
||||||
httpClient := c.kclient.RESTClient.Client
|
httpClient := c.kclient.RESTClient.Client
|
||||||
resp, err := httpClient.Get(fmt.Sprintf("http://%s/backupnow", k8sutil.MakeBackupHostPort(c.name)))
|
resp, err := httpClient.Get(fmt.Sprintf("http://%s/backupnow", k8sutil.MakeBackupHostPort(c.name)))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
|
Загрузка…
Ссылка в новой задаче