reconcile: separate recoverable and unrecoverable errors

Hongchao Deng 2016-10-06 11:10:35 -07:00
Parent 537b3dfd20
Commit a43038ee2a
2 changed files with 39 additions and 48 deletions

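For orientation, the core pattern this commit introduces — classify each reconcile error as either fatal (unrecoverable, e.g. no backup exists for disaster recovery) or transient (recoverable), and only tear the cluster down on a fatal one — can be sketched roughly as follows. This is a minimal, self-contained illustration in plain Go, not the controller's actual code; the reconcile stub, the errTransient value, and the tick interval are placeholders.

package main

import (
	"errors"
	"log"
	"time"
)

// Sentinel error mirroring the one kept by the commit; only this case is fatal.
var errNoBackupExist = errors.New("no backup exists for disaster recovery")

// A placeholder transient failure that should simply be retried on the next tick.
var errTransient = errors.New("temporary etcd/cluster hiccup")

// isFatalError reports whether reconciliation should give up for good.
func isFatalError(err error) bool {
	switch err {
	case errNoBackupExist:
		return true
	default:
		return false
	}
}

// reconcile is a stand-in for Cluster.reconcile: fail transiently a few
// times, then hit an unrecoverable condition.
func reconcile(attempt int) error {
	if attempt < 3 {
		return errTransient
	}
	return errNoBackupExist
}

func main() {
	// Mirrors the deferred cleanup the commit adds to Cluster.run(): however
	// the loop exits, the cluster resources get torn down.
	defer log.Println("killing cluster: cleaning up")

	for attempt := 0; ; attempt++ {
		if err := reconcile(attempt); err != nil {
			log.Printf("fail to reconcile: %v", err)
			if isFatalError(err) {
				log.Printf("fatal error, stopping reconciliation: %v", err)
				return
			}
			// Transient errors fall through and are retried on the next tick.
		}
		time.Sleep(100 * time.Millisecond)
	}
}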
View file

@@ -98,6 +98,11 @@ func (c *Cluster) send(ev *clusterEvent) {
 }
 
 func (c *Cluster) run() {
+    defer func() {
+        log.Warningf("killing cluster (%v)", c.name)
+        c.delete()
+        close(c.stopCh)
+    }()
     for {
         select {
         case event := <-c.eventCh:
@@ -106,8 +111,6 @@ func (c *Cluster) run() {
                 log.Printf("update: from: %#v, to: %#v", c.spec, event.spec)
                 c.spec.Size = event.spec.Size
             case eventDeleteCluster:
-                c.delete()
-                close(c.stopCh)
                 return
             }
         case <-time.After(5 * time.Second):
@@ -124,15 +127,25 @@ func (c *Cluster) run() {
                 running.Add(&etcdutil.Member{Name: name})
             }
             if err := c.reconcile(running); err != nil {
-                log.Errorf("fail to reconcile: %v", err)
-                if !isErrTransient(err) {
-                    log.Fatalf("unexpected error from reconciling: %v", err)
+                log.Errorf("cluster (%v) fail to reconcile: %v", c.name, err)
+                if isFatalError(err) {
+                    log.Errorf("cluster (%v) had fatal error: %v", c.name, err)
+                    return
                 }
             }
         }
     }
 }
 
+func isFatalError(err error) bool {
+    switch err {
+    case errNoBackupExist:
+        return true
+    default:
+        return false
+    }
+}
+
 func (c *Cluster) makeSeedMember() *etcdutil.Member {
     etcdName := fmt.Sprintf("%s-%04d", c.name, c.idCounter)
     return &etcdutil.Member{Name: etcdName}
@@ -141,7 +154,8 @@ func (c *Cluster) makeSeedMember() *etcdutil.Member {
 func (c *Cluster) startSeedMember(recoverFromBackup bool) error {
     m := c.makeSeedMember()
     if err := c.createPodAndService(etcdutil.NewMemberSet(m), m, "new", recoverFromBackup); err != nil {
-        return fmt.Errorf("failed to create seed member (%s): %v", m.Name, err)
+        log.Errorf("failed to create seed member (%s): %v", m.Name, err)
+        return err
     }
     c.idCounter++
     log.Infof("created cluster (%s) with seed member (%s)", c.name, m.Name)
@@ -346,8 +360,11 @@ func (c *Cluster) delete() {
 }
 
 func (c *Cluster) createPodAndService(members etcdutil.MemberSet, m *etcdutil.Member, state string, needRecovery bool) error {
+    // TODO: remove garbage service. Because we will fail after service created before pods created.
     if err := k8sutil.CreateEtcdService(c.kclient, m.Name, c.name, c.namespace); err != nil {
-        return err
+        if !k8sutil.IsKubernetesResourceAlreadyExistError(err) {
+            return err
+        }
     }
     token := ""
     if state == "new" {
@@ -361,13 +378,13 @@ func (c *Cluster) createPodAndService(members etcdutil.MemberSet, m *etcdutil.Member, state string, needRecovery bool) error {
 }
 
 func (c *Cluster) removePodAndService(name string) error {
-    err := c.kclient.Pods(c.namespace).Delete(name, nil)
+    err := c.kclient.Services(c.namespace).Delete(name)
     if err != nil {
         if !k8sutil.IsKubernetesResourceNotFoundError(err) {
             return err
         }
     }
-    err = c.kclient.Services(c.namespace).Delete(name)
+    err = c.kclient.Pods(c.namespace).Delete(name, nil)
     if err != nil {
         if !k8sutil.IsKubernetesResourceNotFoundError(err) {
             return err
@@ -384,12 +401,3 @@ func (c *Cluster) pollPods() ([]string, []string, error) {
     ready, unready := k8sutil.SliceReadyAndUnreadyPods(podList)
     return ready, unready, nil
 }
-
-func isErrTransient(err error) bool {
-    switch err {
-    case errTimeoutAddMember, errTimeoutRemoveMember:
-        return true
-    default:
-        return false
-    }
-}

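The changes to createPodAndService and removePodAndService above also make resource creation and deletion tolerant of "already exists" and "not found" responses (and delete the service before the pod), so a retried reconcile does not fail on work that was already done. Below is a rough, self-contained sketch of that tolerance pattern; the resourceClient interface, the fake client, and the error helpers are hypothetical stand-ins for the real Kubernetes client and the k8sutil helpers, not the project's API.

package main

import (
	"errors"
	"fmt"
)

// Hypothetical stand-ins for the real Kubernetes client errors and the
// k8sutil helpers used in the actual code.
var (
	errAlreadyExists = errors.New("already exists")
	errNotFound      = errors.New("not found")
)

func isAlreadyExistsError(err error) bool { return err == errAlreadyExists }
func isNotFoundError(err error) bool      { return err == errNotFound }

type resourceClient interface {
	CreateService(name string) error
	DeleteService(name string) error
	DeletePod(name string) error
}

// createPodAndService tolerates an already-existing service so that a retry
// after a partial failure does not error out.
func createPodAndService(c resourceClient, name string) error {
	if err := c.CreateService(name); err != nil {
		if !isAlreadyExistsError(err) {
			return err
		}
	}
	// ... create the pod here ...
	return nil
}

// removePodAndService deletes the service first, then the pod, ignoring
// not-found errors so deletion stays idempotent.
func removePodAndService(c resourceClient, name string) error {
	if err := c.DeleteService(name); err != nil && !isNotFoundError(err) {
		return err
	}
	if err := c.DeletePod(name); err != nil && !isNotFoundError(err) {
		return err
	}
	return nil
}

// fakeClient is a tiny in-memory client used to exercise the sketch.
type fakeClient struct{ services, pods map[string]bool }

func (f *fakeClient) CreateService(name string) error {
	if f.services[name] {
		return errAlreadyExists
	}
	f.services[name] = true
	return nil
}

func (f *fakeClient) DeleteService(name string) error {
	if !f.services[name] {
		return errNotFound
	}
	delete(f.services, name)
	return nil
}

func (f *fakeClient) DeletePod(name string) error {
	if !f.pods[name] {
		return errNotFound
	}
	delete(f.pods, name)
	return nil
}

func main() {
	c := &fakeClient{services: map[string]bool{}, pods: map[string]bool{}}
	fmt.Println(createPodAndService(c, "etcd-0000")) // <nil>
	fmt.Println(createPodAndService(c, "etcd-0000")) // <nil>: already-exists is tolerated
	fmt.Println(removePodAndService(c, "etcd-0000")) // <nil>: missing pod is tolerated
}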
View file

@@ -4,21 +4,17 @@ import (
     "errors"
     "fmt"
     "net/http"
-    "time"
 
     log "github.com/Sirupsen/logrus"
     "github.com/coreos/etcd/clientv3"
-    "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
     "github.com/coreos/kube-etcd-controller/pkg/util/constants"
     "github.com/coreos/kube-etcd-controller/pkg/util/etcdutil"
     "github.com/coreos/kube-etcd-controller/pkg/util/k8sutil"
     "golang.org/x/net/context"
-    "k8s.io/kubernetes/pkg/util/wait"
 )
 
 var (
-    errTimeoutAddMember    = errors.New("timeout to add etcd member")
-    errTimeoutRemoveMember = errors.New("timeout to remove etcd member")
     errNoBackupExist = errors.New("No backup exist for a disaster recovery")
 )
 // reconcile reconciles
@@ -110,37 +106,24 @@ func (c *Cluster) addOneMember() error {
         return err
     }
     defer etcdcli.Close()
 
     newMemberName := fmt.Sprintf("%s-%04d", c.name, c.idCounter)
     newMember := &etcdutil.Member{Name: newMemberName}
-    var id uint64
-    // Could have "unhealthy cluster" due to 5 second strict check. Retry.
-    err = wait.Poll(2*time.Second, 20*time.Second, func() (done bool, err error) {
-        ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
-        resp, err := etcdcli.MemberAdd(ctx, []string{newMember.PeerAddr()})
-        if err != nil {
-            if err == rpctypes.ErrUnhealthy || err == context.DeadlineExceeded {
-                return false, nil
-            }
-            return false, fmt.Errorf("etcdcli failed to add one member: %v", err)
-        }
-        id = resp.Member.ID
-        return true, nil
-    })
+    ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
+    resp, err := etcdcli.MemberAdd(ctx, []string{newMember.PeerAddr()})
     if err != nil {
-        if err == wait.ErrWaitTimeout {
-            err = errTimeoutAddMember
-        }
         log.Errorf("fail to add new member (%s): %v", newMember.Name, err)
         return err
     }
-    newMember.ID = id
+    newMember.ID = resp.Member.ID
     c.members.Add(newMember)
     if err := c.createPodAndService(c.members, newMember, "existing", false); err != nil {
         log.Errorf("fail to create member (%s): %v", newMember.Name, err)
         return err
     }
     c.idCounter++
-    log.Printf("added member, cluster: %s", c.members.PeerURLPairs())
+    log.Printf("added member (%s), cluster (%s)", newMember.Name, c.members.PeerURLPairs())
     return nil
 }
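The hunk above drops the local wait.Poll retry: the new addOneMember makes a single MemberAdd call under a request timeout and hands any error back to the reconcile loop, which decides whether to retry next tick or give up. A rough standalone sketch of that flow is below, using the same clientv3 package imported in the diff; the endpoints, timeouts, and peer URL are placeholder values, not the controller's configuration.

package main

import (
	"context"
	"log"
	"time"

	"github.com/coreos/etcd/clientv3"
)

// addOneMember makes a single add-member attempt under a bounded request
// timeout and returns the raw error to the caller, mirroring the simplified
// flow after this change.
func addOneMember(endpoints []string, peerURL string) (uint64, error) {
	cli, err := clientv3.New(clientv3.Config{
		Endpoints:   endpoints,
		DialTimeout: 5 * time.Second, // placeholder timeout
	})
	if err != nil {
		return 0, err
	}
	defer cli.Close()

	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) // placeholder timeout
	defer cancel()
	resp, err := cli.MemberAdd(ctx, []string{peerURL})
	if err != nil {
		// No local retry; the caller classifies the error and reacts.
		log.Printf("fail to add new member: %v", err)
		return 0, err
	}
	return resp.Member.ID, nil
}

func main() {
	id, err := addOneMember([]string{"http://127.0.0.1:2379"}, "http://127.0.0.1:2380")
	if err != nil {
		log.Fatal(err)
	}
	log.Printf("added member with ID %x", id)
}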
@@ -161,10 +144,8 @@ func (c *Cluster) removeMember(toRemove *etcdutil.Member) error {
     ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
     if _, err := etcdcli.Cluster.MemberRemove(ctx, toRemove.ID); err != nil {
-        if err == rpctypes.ErrUnhealthy || err == context.DeadlineExceeded {
-            return errTimeoutRemoveMember
-        }
-        return fmt.Errorf("etcdcli failed to remove one member: %v", err)
+        log.Errorf("etcdcli failed to remove one member: %v", err)
+        return err
     }
     c.members.Remove(toRemove.Name)
     if err := c.removePodAndService(toRemove.Name); err != nil {
@@ -176,9 +157,11 @@ func (c *Cluster) removeMember(toRemove *etcdutil.Member) error {
 
 func (c *Cluster) disasterRecovery(left etcdutil.MemberSet) error {
     if c.spec.Backup == nil {
-        return fmt.Errorf("fail to do disaster recovery for cluster (%s): no backup policy has been defined."+
-            " (TODO: Mark cluster as dead)", c.name)
+        log.Errorf("fail to do disaster recovery for cluster (%s): no backup policy has been defined.", c.name)
+        return errNoBackupExist
     }
+    // TODO: We shouldn't return error in backupnow. If backupnow failed, we should ask if it has any backup before.
+    // If so, we can still continue. Otherwise, it's fatal error.
     httpClient := c.kclient.RESTClient.Client
     resp, err := httpClient.Get(fmt.Sprintf("http://%s/backupnow", k8sutil.MakeBackupHostPort(c.name)))
     if err != nil {