reconcile: separate recoverable and unrecoverable errors
This commit is contained in:
Parent 537b3dfd20
Commit a43038ee2a
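This commit separates reconcile errors into recoverable and unrecoverable (fatal) classes. The transient-error machinery (the wait.Poll retry around MemberAdd, errTimeoutAddMember, errTimeoutRemoveMember, and isErrTransient) is removed: such failures are now just logged and returned, and the 5-second reconcile loop retries them on its next tick. Only fatal errors terminate the cluster; currently that is errNoBackupExist, returned when disaster recovery is needed but no backup policy was defined. Cluster teardown (delete plus closing stopCh) is consolidated into a single deferred closure in run().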
@@ -98,6 +98,11 @@ func (c *Cluster) send(ev *clusterEvent) {
 }
 
 func (c *Cluster) run() {
+    defer func() {
+        log.Warningf("killing cluster (%v)", c.name)
+        c.delete()
+        close(c.stopCh)
+    }()
     for {
         select {
         case event := <-c.eventCh:
@@ -106,8 +111,6 @@ func (c *Cluster) run() {
                 log.Printf("update: from: %#v, to: %#v", c.spec, event.spec)
                 c.spec.Size = event.spec.Size
             case eventDeleteCluster:
-                c.delete()
-                close(c.stopCh)
                 return
             }
         case <-time.After(5 * time.Second):
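The defer added in the first hunk replaces the per-case teardown deleted here: any return from run(), whether from a delete event or a fatal reconcile error, now runs the same cleanup exactly once. A minimal standalone sketch of the pattern (illustrative types and names, not the controller's actual ones):

package main

import "log"

type cluster struct {
    name   string
    stopCh chan struct{}
}

func (c *cluster) delete() { log.Printf("deleting resources of cluster %s", c.name) }

func (c *cluster) run(events <-chan string) {
    // All exit paths share one teardown: log, release resources, signal waiters.
    defer func() {
        log.Printf("killing cluster (%v)", c.name)
        c.delete()
        close(c.stopCh)
    }()
    for ev := range events {
        if ev == "delete" {
            return // cleanup runs in the deferred closure
        }
    }
}

func main() {
    c := &cluster{name: "demo", stopCh: make(chan struct{})}
    events := make(chan string, 1)
    events <- "delete"
    c.run(events)
    <-c.stopCh // unblocks immediately: closed by the deferred cleanup
}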
@@ -124,15 +127,25 @@ func (c *Cluster) run() {
                 running.Add(&etcdutil.Member{Name: name})
             }
             if err := c.reconcile(running); err != nil {
-                log.Errorf("fail to reconcile: %v", err)
-                if !isErrTransient(err) {
-                    log.Fatalf("unexpected error from reconciling: %v", err)
+                log.Errorf("cluster (%v) fail to reconcile: %v", c.name, err)
+                if isFatalError(err) {
+                    log.Errorf("cluster (%v) had fatal error: %v", c.name, err)
+                    return
                 }
             }
         }
     }
 }
 
+func isFatalError(err error) bool {
+    switch err {
+    case errNoBackupExist:
+        return true
+    default:
+        return false
+    }
+}
+
 func (c *Cluster) makeSeedMember() *etcdutil.Member {
     etcdName := fmt.Sprintf("%s-%04d", c.name, c.idCounter)
     return &etcdutil.Member{Name: etcdName}
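isFatalError inverts the old isErrTransient test: instead of listing transient errors and crashing on everything else, the loop now treats every error as retryable unless it is known to be fatal. A self-contained sketch of that control flow (the error values are stand-ins):

package main

import (
    "errors"
    "fmt"
)

var errNoBackupExist = errors.New("no backup exist for a disaster recovery")

func isFatalError(err error) bool {
    switch err {
    case errNoBackupExist:
        return true
    default:
        return false
    }
}

func main() {
    // A recoverable failure is only logged; the next reconcile tick retries it.
    // A fatal failure ends the cluster's run loop.
    attempts := []error{
        errors.New("timeout to add etcd member"), // recoverable
        errNoBackupExist,                         // fatal
    }
    for _, err := range attempts {
        fmt.Printf("fail to reconcile: %v\n", err)
        if isFatalError(err) {
            fmt.Println("fatal error: killing cluster")
            return
        }
    }
}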
@@ -141,7 +154,8 @@ func (c *Cluster) makeSeedMember() *etcdutil.Member {
 func (c *Cluster) startSeedMember(recoverFromBackup bool) error {
     m := c.makeSeedMember()
     if err := c.createPodAndService(etcdutil.NewMemberSet(m), m, "new", recoverFromBackup); err != nil {
-        return fmt.Errorf("failed to create seed member (%s): %v", m.Name, err)
+        log.Errorf("failed to create seed member (%s): %v", m.Name, err)
+        return err
     }
     c.idCounter++
     log.Infof("created cluster (%s) with seed member (%s)", c.name, m.Name)
@@ -346,9 +360,12 @@ func (c *Cluster) delete() {
 }
 
 func (c *Cluster) createPodAndService(members etcdutil.MemberSet, m *etcdutil.Member, state string, needRecovery bool) error {
+    // TODO: remove garbage service. Because we will fail after service created before pods created.
     if err := k8sutil.CreateEtcdService(c.kclient, m.Name, c.name, c.namespace); err != nil {
+        if !k8sutil.IsKubernetesResourceAlreadyExistError(err) {
             return err
+        }
     }
     token := ""
     if state == "new" {
         token = uuid.New()
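createPodAndService now tolerates an already-existing service, which makes creation safe to re-run after a partial failure (the TODO notes the service-created-but-pod-missing window). A sketch of the already-exists pattern, with a stand-in for k8sutil.IsKubernetesResourceAlreadyExistError:

package main

import (
    "errors"
    "fmt"
)

// errAlreadyExists stands in for the Kubernetes "already exists" API error.
var errAlreadyExists = errors.New("already exists")

func createService(existing map[string]bool, name string) error {
    if existing[name] {
        return errAlreadyExists
    }
    existing[name] = true
    return nil
}

func ensureService(existing map[string]bool, name string) error {
    // Ignoring "already exists" makes creation idempotent: a retry after a
    // partial failure no longer aborts the whole reconciliation.
    if err := createService(existing, name); err != nil && err != errAlreadyExists {
        return err
    }
    return nil
}

func main() {
    services := map[string]bool{}
    for i := 0; i < 2; i++ { // the second pass simulates a retry
        if err := ensureService(services, "demo-0000"); err != nil {
            fmt.Println("unexpected:", err)
            return
        }
    }
    fmt.Println("service ensured twice without error")
}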
@@ -361,13 +378,13 @@ func (c *Cluster) createPodAndService(members etcdutil.MemberSet, m *etcdutil.Me
 }
 
 func (c *Cluster) removePodAndService(name string) error {
-    err := c.kclient.Pods(c.namespace).Delete(name, nil)
+    err := c.kclient.Services(c.namespace).Delete(name)
     if err != nil {
         if !k8sutil.IsKubernetesResourceNotFoundError(err) {
             return err
         }
     }
-    err = c.kclient.Services(c.namespace).Delete(name)
+    err = c.kclient.Pods(c.namespace).Delete(name, nil)
     if err != nil {
         if !k8sutil.IsKubernetesResourceNotFoundError(err) {
             return err
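removePodAndService flips the deletion order (service first, then pod) and keeps tolerating not-found on both deletes, so interrupted cleanup can simply run again. The same idempotency idea as above, with a stand-in for k8sutil.IsKubernetesResourceNotFoundError:

package main

import (
    "errors"
    "fmt"
)

// errNotFound stands in for the Kubernetes "not found" API error.
var errNotFound = errors.New("not found")

func deleteFrom(store map[string]bool, name string) error {
    if !store[name] {
        return errNotFound
    }
    delete(store, name)
    return nil
}

func removePodAndService(services, pods map[string]bool, name string) error {
    // Service first, then pod; not-found is ignored on both, so a cleanup
    // that died halfway through is safe to repeat.
    for _, store := range []map[string]bool{services, pods} {
        if err := deleteFrom(store, name); err != nil && err != errNotFound {
            return err
        }
    }
    return nil
}

func main() {
    services := map[string]bool{"demo-0000": true}
    pods := map[string]bool{} // pod already gone: tolerated
    fmt.Println(removePodAndService(services, pods, "demo-0000")) // <nil>
}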
@@ -384,12 +401,3 @@ func (c *Cluster) pollPods() ([]string, []string, error) {
     ready, unready := k8sutil.SliceReadyAndUnreadyPods(podList)
     return ready, unready, nil
 }
-
-func isErrTransient(err error) bool {
-    switch err {
-    case errTimeoutAddMember, errTimeoutRemoveMember:
-        return true
-    default:
-        return false
-    }
-}

@@ -4,21 +4,17 @@ import (
     "errors"
     "fmt"
     "net/http"
-    "time"
 
     log "github.com/Sirupsen/logrus"
     "github.com/coreos/etcd/clientv3"
-    "github.com/coreos/etcd/etcdserver/api/v3rpc/rpctypes"
     "github.com/coreos/kube-etcd-controller/pkg/util/constants"
     "github.com/coreos/kube-etcd-controller/pkg/util/etcdutil"
     "github.com/coreos/kube-etcd-controller/pkg/util/k8sutil"
     "golang.org/x/net/context"
-    "k8s.io/kubernetes/pkg/util/wait"
 )
 
 var (
-    errTimeoutAddMember    = errors.New("timeout to add etcd member")
-    errTimeoutRemoveMember = errors.New("timeout to remove etcd member")
+    errNoBackupExist = errors.New("No backup exist for a disaster recovery")
 )
 
 // reconcile reconciles
@@ -110,37 +106,24 @@ func (c *Cluster) addOneMember() error {
         return err
     }
     defer etcdcli.Close()
 
     newMemberName := fmt.Sprintf("%s-%04d", c.name, c.idCounter)
     newMember := &etcdutil.Member{Name: newMemberName}
-    var id uint64
-    // Could have "unhealthy cluster" due to 5 second strict check. Retry.
-    err = wait.Poll(2*time.Second, 20*time.Second, func() (done bool, err error) {
     ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
     resp, err := etcdcli.MemberAdd(ctx, []string{newMember.PeerAddr()})
-        if err != nil {
-            if err == rpctypes.ErrUnhealthy || err == context.DeadlineExceeded {
-                return false, nil
-            }
-            return false, fmt.Errorf("etcdcli failed to add one member: %v", err)
-        }
-        id = resp.Member.ID
-        return true, nil
-    })
     if err != nil {
-        if err == wait.ErrWaitTimeout {
-            err = errTimeoutAddMember
-        }
         log.Errorf("fail to add new member (%s): %v", newMember.Name, err)
         return err
     }
-    newMember.ID = id
+    newMember.ID = resp.Member.ID
     c.members.Add(newMember)
 
     if err := c.createPodAndService(c.members, newMember, "existing", false); err != nil {
         log.Errorf("fail to create member (%s): %v", newMember.Name, err)
         return err
     }
     c.idCounter++
-    log.Printf("added member, cluster: %s", c.members.PeerURLPairs())
+    log.Printf("added member (%s), cluster (%s)", newMember.Name, c.members.PeerURLPairs())
     return nil
 }
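The wait.Poll retry around MemberAdd is deleted along with the timeout sentinels: one attempt is made, the failure is logged and returned, and the reconcile loop's 5-second tick is the retry. A compile-only sketch of the simplified call (the function name and timeout value are illustrative):

package cluster

import (
    "fmt"
    "time"

    "github.com/coreos/etcd/clientv3"
    "golang.org/x/net/context"
)

// addMemberOnce makes a single MemberAdd attempt. Any error, transient or
// not, is returned to the caller; the reconcile loop retries on its next tick.
func addMemberOnce(cli *clientv3.Client, peerURL string) (uint64, error) {
    ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
    defer cancel()
    resp, err := cli.MemberAdd(ctx, []string{peerURL})
    if err != nil {
        return 0, fmt.Errorf("fail to add new member (%s): %v", peerURL, err)
    }
    return resp.Member.ID, nil
}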
@@ -161,10 +144,8 @@ func (c *Cluster) removeMember(toRemove *etcdutil.Member) error {
 
     ctx, _ := context.WithTimeout(context.Background(), constants.DefaultRequestTimeout)
     if _, err := etcdcli.Cluster.MemberRemove(ctx, toRemove.ID); err != nil {
-        if err == rpctypes.ErrUnhealthy || err == context.DeadlineExceeded {
-            return errTimeoutRemoveMember
-        }
-        return fmt.Errorf("etcdcli failed to remove one member: %v", err)
+        log.Errorf("etcdcli failed to remove one member: %v", err)
+        return err
     }
     c.members.Remove(toRemove.Name)
     if err := c.removePodAndService(toRemove.Name); err != nil {
@@ -176,9 +157,11 @@ func (c *Cluster) removeMember(toRemove *etcdutil.Member) error {
 
 func (c *Cluster) disasterRecovery(left etcdutil.MemberSet) error {
     if c.spec.Backup == nil {
-        return fmt.Errorf("fail to do disaster recovery for cluster (%s): no backup policy has been defined."+
-            " (TODO: Mark cluster as dead)", c.name)
+        log.Errorf("fail to do disaster recovery for cluster (%s): no backup policy has been defined.", c.name)
+        return errNoBackupExist
     }
+    // TODO: We shouldn't return error in backupnow. If backupnow failed, we should ask if it has any backup before.
+    // If so, we can still continue. Otherwise, it's fatal error.
     httpClient := c.kclient.RESTClient.Client
     resp, err := httpClient.Get(fmt.Sprintf("http://%s/backupnow", k8sutil.MakeBackupHostPort(c.name)))
     if err != nil {
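disasterRecovery now reports a missing backup policy as the fatal errNoBackupExist instead of a wrapped message string, and the TODO sketches the intended fallback when an on-demand backup fails. A sketch of the backupnow request (the URL shape comes from the diff; the status-code handling is an assumption):

package cluster

import (
    "fmt"
    "net/http"
)

// requestBackupNow asks the backup sidecar for an immediate snapshot.
// Treating any non-200 reply as failure is an assumption of this sketch.
func requestBackupNow(client *http.Client, hostPort string) error {
    resp, err := client.Get(fmt.Sprintf("http://%s/backupnow", hostPort))
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    if resp.StatusCode != http.StatusOK {
        return fmt.Errorf("backupnow failed: %s", resp.Status)
    }
    return nil
}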