reconcile: separate recoverable and unrecoverable errors

Commit: a43038ee2a
Parent: 537b3dfd20
Author: Hongchao Deng, 2016-10-06 11:10:35 -07:00
2 changed files with 39 additions and 48 deletions

View file

@@ -98,6 +98,11 @@ func (c *Cluster) send(ev *clusterEvent) {
 }
 
 func (c *Cluster) run() {
+	defer func() {
+		log.Warningf("killing cluster (%v)", c.name)
+		c.delete()
+		close(c.stopCh)
+	}()
 	for {
 		select {
 		case event := <-c.eventCh:
@@ -106,8 +111,6 @@ func (c *Cluster) run() {
 				log.Printf("update: from: %#v, to: %#v", c.spec, event.spec)
 				c.spec.Size = event.spec.Size
 			case eventDeleteCluster:
-				c.delete()
-				close(c.stopCh)
 				return
 			}
 		case <-time.After(5 * time.Second):
@@ -124,15 +127,25 @@ func (c *Cluster) run() {
 				running.Add(&etcdutil.Member{Name: name})
 			}
 			if err := c.reconcile(running); err != nil {
-				log.Errorf("fail to reconcile: %v", err)
-				if !isErrTransient(err) {
-					log.Fatalf("unexpected error from reconciling: %v", err)
+				log.Errorf("cluster (%v) fail to reconcile: %v", c.name, err)
+				if isFatalError(err) {
+					log.Errorf("cluster (%v) had fatal error: %v", c.name, err)
+					return
 				}
 			}
 		}
 	}
 }
 
+func isFatalError(err error) bool {
+	switch err {
+	case errNoBackupExist:
+		return true
+	default:
+		return false
+	}
+}
+
 func (c *Cluster) makeSeedMember() *etcdutil.Member {
 	etcdName := fmt.Sprintf("%s-%04d", c.name, c.idCounter)
 	return &etcdutil.Member{Name: etcdName}
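
The net effect of these hunks: reconcile errors are classified, the run loop returns on fatal ones instead of calling log.Fatalf, and the new deferred block performs the delete/close cleanup on every exit path, whether the loop stops from a delete event or a fatal error. A minimal, self-contained sketch of that pattern (hypothetical names, not the controller's real types):

package main

import (
	"errors"
	"log"
	"time"
)

var errNoBackupExist = errors.New("no backup exists for disaster recovery")

// isFatalError mirrors the commit's classifier: only unrecoverable
// errors (here, a missing backup) stop the cluster.
func isFatalError(err error) bool {
	switch err {
	case errNoBackupExist:
		return true
	default:
		return false
	}
}

// run retries recoverable errors on the next tick and returns on fatal
// ones; the deferred cleanup runs on every exit path.
func run(reconcile func() error, stopCh chan struct{}) {
	defer func() {
		log.Println("killing cluster")
		close(stopCh)
	}()
	ticker := time.NewTicker(5 * time.Second)
	defer ticker.Stop()
	for range ticker.C {
		if err := reconcile(); err != nil {
			log.Printf("fail to reconcile: %v", err)
			if isFatalError(err) {
				return // unrecoverable: give up instead of retrying
			}
			// recoverable: fall through and retry on the next tick
		}
	}
}

func main() {
	stopCh := make(chan struct{})
	go run(func() error { return errNoBackupExist }, stopCh)
	<-stopCh // returns once the fatal error kills the loop
}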
@@ -141,7 +154,8 @@ func (c *Cluster) makeSeedMember() *etcdutil.Member {
 func (c *Cluster) startSeedMember(recoverFromBackup bool) error {
 	m := c.makeSeedMember()
 	if err := c.createPodAndService(etcdutil.NewMemberSet(m), m, "new", recoverFromBackup); err != nil {
-		return fmt.Errorf("failed to create seed member (%s): %v", m.Name, err)
+		log.Errorf("failed to create seed member (%s): %v", m.Name, err)
+		return err
 	}
 	c.idCounter++
 	log.Infof("created cluster (%s) with seed member (%s)", c.name, m.Name)
@ -346,8 +360,11 @@ func (c *Cluster) delete() {
} }
func (c *Cluster) createPodAndService(members etcdutil.MemberSet, m *etcdutil.Member, state string, needRecovery bool) error { func (c *Cluster) createPodAndService(members etcdutil.MemberSet, m *etcdutil.Member, state string, needRecovery bool) error {
// TODO: remove garbage service. Because we will fail after service created before pods created.
if err := k8sutil.CreateEtcdService(c.kclient, m.Name, c.name, c.namespace); err != nil { if err := k8sutil.CreateEtcdService(c.kclient, m.Name, c.name, c.namespace); err != nil {
return err if !k8sutil.IsKubernetesResourceAlreadyExistError(err) {
return err
}
} }
token := "" token := ""
if state == "new" { if state == "new" {
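
Treating "already exists" as success makes service creation idempotent, so a reconcile retry after a partial failure (service created, pod creation failed) no longer aborts. The k8sutil helper itself is not shown in this diff; it is presumably a thin wrapper over the API server's 409 conflict check, roughly like this sketch (written against today's apimachinery package, not the 2016 vendored client):

package k8sutil

import apierrors "k8s.io/apimachinery/pkg/api/errors"

// IsKubernetesResourceAlreadyExistError reports whether err is the API
// server's "already exists" rejection, which the caller treats as
// success to keep createPodAndService retryable.
func IsKubernetesResourceAlreadyExistError(err error) bool {
	return apierrors.IsAlreadyExists(err)
}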
@@ -361,13 +378,13 @@ func (c *Cluster) createPodAndService(members etcdutil.MemberSet, m *etcdutil.Member, state string, needRecovery bool) error {
 }
 
 func (c *Cluster) removePodAndService(name string) error {
-	err := c.kclient.Pods(c.namespace).Delete(name, nil)
+	err := c.kclient.Services(c.namespace).Delete(name)
 	if err != nil {
 		if !k8sutil.IsKubernetesResourceNotFoundError(err) {
 			return err
 		}
 	}
-	err = c.kclient.Services(c.namespace).Delete(name)
+	err = c.kclient.Pods(c.namespace).Delete(name, nil)
 	if err != nil {
 		if !k8sutil.IsKubernetesResourceNotFoundError(err) {
 			return err
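
Both deletes tolerate NotFound, so removePodAndService can run twice without failing; the hunk also flips the order so the service is deleted before the pod, mirroring the creation order above. The shared pattern, as a sketch (hypothetical helper, not in the repo):

package cluster

// deleteIgnoreNotFound treats "already gone" as success, which is what
// makes the teardown safe to retry after a partial failure.
func deleteIgnoreNotFound(del func() error, isNotFound func(error) bool) error {
	if err := del(); err != nil && !isNotFound(err) {
		return err
	}
	return nil
}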
@@ -384,12 +401,3 @@ func (c *Cluster) pollPods() ([]string, []string, error) {
 	ready, unready := k8sutil.SliceReadyAndUnreadyPods(podList)
 	return ready, unready, nil
 }
-
-func isErrTransient(err error) bool {
-	switch err {
-	case errTimeoutAddMember, errTimeoutRemoveMember:
-		return true
-	default:
-		return false
-	}
-}

View file

@@ -4,21 +4,17 @@ import (
 	"errors"
 	"fmt"
 	"net/http"
-	"time"
 
 	log "github.com/Sirupsen/logrus"
 	"github.com/coreos/etcd/clientv3"
-	"github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
 	"github.com/coreos/kube-etcd-controller/pkg/util/constants"
 	"github.com/coreos/kube-etcd-controller/pkg/util/etcdutil"
 	"github.com/coreos/kube-etcd-controller/pkg/util/k8sutil"
 	"golang.org/x/net/context"
-	"k8s.io/kubernetes/pkg/util/wait"
 )
 
 var (
-	errTimeoutAddMember    = errors.New("timeout to add etcd member")
-	errTimeoutRemoveMember = errors.New("timeout to remove etcd member")
+	errNoBackupExist = errors.New("no backup exists for disaster recovery")
 )
 
 // reconcile reconciles
@@ -110,37 +106,24 @@ func (c *Cluster) addOneMember() error {
 		return err
 	}
 	defer etcdcli.Close()
 
 	newMemberName := fmt.Sprintf("%s-%04d", c.name, c.idCounter)
 	newMember := &etcdutil.Member{Name: newMemberName}
-	var id uint64
-	// Could have "unhealthy cluster" due to 5 second strict check. Retry.
-	err = wait.Poll(2*time.Second, 20*time.Second, func() (done bool, err error) {
-		ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
-		resp, err := etcdcli.MemberAdd(ctx, []string{newMember.PeerAddr()})
-		if err != nil {
-			if err == rpctypes.ErrUnhealthy || err == context.DeadlineExceeded {
-				return false, nil
-			}
-			return false, fmt.Errorf("etcdcli failed to add one member: %v", err)
-		}
-		id = resp.Member.ID
-		return true, nil
-	})
+	ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
+	resp, err := etcdcli.MemberAdd(ctx, []string{newMember.PeerAddr()})
 	if err != nil {
-		if err == wait.ErrWaitTimeout {
-			err = errTimeoutAddMember
-		}
 		log.Errorf("fail to add new member (%s): %v", newMember.Name, err)
 		return err
 	}
-	newMember.ID = id
+	newMember.ID = resp.Member.ID
 	c.members.Add(newMember)
 	if err := c.createPodAndService(c.members, newMember, "existing", false); err != nil {
+		log.Errorf("fail to create member (%s): %v", newMember.Name, err)
 		return err
 	}
 	c.idCounter++
-	log.Printf("added member, cluster: %s", c.members.PeerURLPairs())
+	log.Printf("added member (%s), cluster (%s)", newMember.Name, c.members.PeerURLPairs())
 	return nil
 }
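
One nit survives the simplification: `ctx, _ := context.WithTimeout(...)` discards the CancelFunc, so the deadline's timer is not released until it fires on its own (go vet flags this today). A sketch of the idiomatic form, assuming the same clientv3 and constants packages this file already imports:

package cluster

import (
	"github.com/coreos/etcd/clientv3"
	"github.com/coreos/kube-etcd-controller/pkg/util/constants"
	"golang.org/x/net/context"
)

// memberAdd is a sketch of the call above with the CancelFunc honored:
// cancel releases the deadline timer as soon as the RPC returns.
func memberAdd(etcdcli *clientv3.Client, peerURL string) (uint64, error) {
	ctx, cancel := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
	defer cancel()
	resp, err := etcdcli.MemberAdd(ctx, []string{peerURL})
	if err != nil {
		return 0, err
	}
	return resp.Member.ID, nil
}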
@@ -161,10 +144,8 @@ func (c *Cluster) removeMember(toRemove *etcdutil.Member) error {
 	ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
 	if _, err := etcdcli.Cluster.MemberRemove(ctx, toRemove.ID); err != nil {
-		if err == rpctypes.ErrUnhealthy || err == context.DeadlineExceeded {
-			return errTimeoutRemoveMember
-		}
-		return fmt.Errorf("etcdcli failed to remove one member: %v", err)
+		log.Errorf("etcdcli failed to remove one member: %v", err)
+		return err
 	}
 	c.members.Remove(toRemove.Name)
 	if err := c.removePodAndService(toRemove.Name); err != nil {
@@ -176,9 +157,11 @@ func (c *Cluster) removeMember(toRemove *etcdutil.Member) error {
 func (c *Cluster) disasterRecovery(left etcdutil.MemberSet) error {
 	if c.spec.Backup == nil {
-		return fmt.Errorf("fail to do disaster recovery for cluster (%s): no backup policy has been defined."+
-			" (TODO: Mark cluster as dead)", c.name)
+		log.Errorf("fail to do disaster recovery for cluster (%s): no backup policy has been defined", c.name)
+		return errNoBackupExist
 	}
+	// TODO: We shouldn't return an error from backupnow. If backupnow failed, we should ask whether any
+	// backup existed before. If so, we can still continue; otherwise it's a fatal error.
 	httpClient := c.kclient.RESTClient.Client
 	resp, err := httpClient.Get(fmt.Sprintf("http://%s/backupnow", k8sutil.MakeBackupHostPort(c.name)))
 	if err != nil {