Add documentation and update restart rules.

Implement time-based backoff for restarting and fix failure count when
the maximum is 0

Signed-off-by: Michael Crosby <michael@docker.com>
This commit is contained in:
Michael Crosby 2014-08-04 18:20:53 -07:00
Родитель 2b0776c883
Коммит 860c13b788
3 изменённых файлов: 111 добавлений и 33 удалений

Просмотреть файл

@ -11,6 +11,8 @@ import (
"github.com/docker/docker/utils"
)
const defaultTimeIncrement = 100
// containerMonitor monitors the execution of a container's main process.
// If a restart policy is specified for the container the monitor will ensure that the
// process is restarted based on the rules of the policy. When the container is finally stopped
@ -19,16 +21,30 @@ import (
// containerMonitor tracks the lifetime of a single container's main process
// and applies its restart policy on exit.
type containerMonitor struct {
	mux sync.Mutex

	// container is the container being monitored
	container *Container

	// restartPolicy is the policy being applied to the container monitor
	restartPolicy runconfig.RestartPolicy

	// failureCount is the number of times the container has failed to
	// start in a row
	failureCount int

	// shouldStop signals the monitor that the next time the container exits it is
	// either because docker or the user asked for the container to be stopped
	shouldStop bool

	// timeIncrement is the amount of time to wait between restarts
	// this is in milliseconds
	timeIncrement int
}
// newContainerMonitor returns a monitor for the given container that will
// apply the supplied restart policy, starting from the default restart delay.
func newContainerMonitor(container *Container, policy runconfig.RestartPolicy) *containerMonitor {
	m := &containerMonitor{
		container:     container,
		restartPolicy: policy,
		timeIncrement: defaultTimeIncrement,
	}

	return m
}
@ -62,7 +78,7 @@ func (m *containerMonitor) Close() error {
// reset resets the container's IO and ensures that the command is able to be executed again
// by copying the data into a new struct
func (m *containerMonitor) reset() {
func (m *containerMonitor) reset(successful bool) {
container := m.container
if container.Config.OpenStdin {
@ -107,14 +123,29 @@ func (m *containerMonitor) reset() {
Dir: c.Dir,
SysProcAttr: c.SysProcAttr,
}
// the container exited successfully so we need to reset the failure counter
// and the timeIncrement back to the default values
if successful {
m.failureCount = 0
m.timeIncrement = defaultTimeIncrement
} else {
// otherwise we need to increment the amount of time we wait before restarting
// the process. We will build up by multiplying the increment by 2
m.failureCount++
m.timeIncrement *= 2
}
}
// Start starts the containers process and monitors it according to the restart policy
func (m *containerMonitor) Start() error {
var (
err error
exitCode int
err error
exitStatus int
)
// ensure that when the monitor finally exits we release the networking and unmount the rootfs
defer m.Close()
// reset the restart count
@ -122,31 +153,26 @@ func (m *containerMonitor) Start() error {
for !m.shouldStop {
m.container.RestartCount++
if err := m.container.startLoggingToDisk(); err != nil {
m.reset()
m.reset(false)
return err
}
pipes := execdriver.NewPipes(m.container.stdin, m.container.stdout, m.container.stderr, m.container.Config.OpenStdin)
if exitCode, err = m.container.daemon.Run(m.container, pipes, m.callback); err != nil {
m.failureCount++
if m.failureCount == m.restartPolicy.MaximumRetryCount {
m.ExitOnNext()
}
if exitStatus, err = m.container.daemon.Run(m.container, pipes, m.callback); err != nil {
utils.Errorf("Error running container: %s", err)
}
// We still wait to set the state as stopped and ensure that the locks were released
m.container.State.SetStopped(exitCode)
m.container.State.SetStopped(exitStatus)
m.reset()
m.reset(err == nil && exitStatus == 0)
if m.shouldRestart(exitCode) {
time.Sleep(1 * time.Second)
if m.shouldRestart(exitStatus) {
time.Sleep(time.Duration(m.timeIncrement) * time.Millisecond)
continue
}
@ -157,16 +183,31 @@ func (m *containerMonitor) Start() error {
return err
}
func (m *containerMonitor) shouldRestart(exitCode int) bool {
// shouldRestart checks the restart policy and applies the rules to determine if
// the container's process should be restarted
func (m *containerMonitor) shouldRestart(exitStatus int) bool {
m.mux.Lock()
defer m.mux.Unlock()
shouldRestart := (m.restartPolicy.Name == "always" ||
(m.restartPolicy.Name == "on-failure" && exitCode != 0)) &&
!m.shouldStop
// do not restart if the user or docker has requested that this container be stopped
if m.shouldStop {
return false
}
m.mux.Unlock()
switch m.restartPolicy.Name {
case "always":
return true
case "on-failure":
// the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count
if max := m.restartPolicy.MaximumRetryCount; max != 0 && m.failureCount >= max {
utils.Debugf("stopping restart of container %s because maximum failure could of %d has been reached", max)
return false
}
return shouldRestart
return exitStatus != 0
}
return false
}
// callback ensures that the container's state is properly updated after we

Просмотреть файл

@ -993,6 +993,7 @@ removed before the image is removed.
format: ip:hostPort:containerPort | ip::containerPort | hostPort:containerPort
(use 'docker port' to see the actual mapping)
--privileged=false Give extended privileges to this container
--restart="" Restart policy to apply when a container exits (no, on-failure, always)
--rm=false Automatically remove the container when it exits (incompatible with -d)
--sig-proxy=true Proxy received signals to the process (even in non-TTY mode). SIGCHLD, SIGSTOP, and SIGKILL are not proxied.
-t, --tty=false Allocate a pseudo-TTY
@ -1220,6 +1221,31 @@ application change:
`--rm` option means that when the container exits, the container's layer is
removed.
#### Restart Policies
Using the `--restart` flag on docker run you can specify a restart policy for
how a container should or should not be restarted on exit.
** no ** - Do not restart the container when it exits.
** on-failure ** - Restart the container only if it exits with a non zero exit status.
** always ** - Always restart the container regardless of the exit status.
You can also specify the maximum number of times docker will try to restart the
container when using the ** on-failure ** policy. The default is that docker will try forever to restart the container.
$ sudo docker run --restart=always redis
This will run the redis container with a restart policy of ** always ** so that if
the container exits, docker will restart it.
$ sudo docker run --restart=on-failure:10 redis
This will run the redis container with a restart policy of ** on-failure ** and a
maximum restart count of 10. If the redis container exits with a non-zero exit
status more than 10 times in a row docker will abort trying to restart the container.
## save
Usage: docker save IMAGE

Просмотреть файл

@ -17,11 +17,12 @@ import (
)
// Errors returned while parsing `docker run` options. The rendered diff
// contained both the pre- and post-image declarations; only the post-image
// (gofmt-aligned, including the new --restart/--rm conflict error) is kept.
var (
	ErrInvalidWorkingDirectory            = fmt.Errorf("The working directory is invalid. It needs to be an absolute path.")
	ErrConflictAttachDetach               = fmt.Errorf("Conflicting options: -a and -d")
	ErrConflictDetachAutoRemove           = fmt.Errorf("Conflicting options: --rm and -d")
	ErrConflictNetworkHostname            = fmt.Errorf("Conflicting options: -h and the network mode (--net)")
	ErrConflictHostNetworkAndLinks        = fmt.Errorf("Conflicting options: --net=host can't be used with links. This would result in undefined behavior.")
	ErrConflictRestartPolicyAndAutoRemove = fmt.Errorf("Conflicting options: --restart and --rm")
)
//FIXME Only used in tests
@ -72,7 +73,7 @@ func parseRun(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Conf
flCpuShares = cmd.Int64([]string{"c", "-cpu-shares"}, 0, "CPU shares (relative weight)")
flCpuset = cmd.String([]string{"-cpuset"}, "", "CPUs in which to allow execution (0-3, 0,1)")
flNetMode = cmd.String([]string{"-net"}, "bridge", "Set the Network mode for the container\n'bridge': creates a new network stack for the container on the docker bridge\n'none': no networking for this container\n'container:<name|id>': reuses another container network stack\n'host': use the host network stack inside the container. Note: the host mode gives the container full access to local system services such as D-bus and is therefore considered insecure.")
flRestartPolicy = cmd.String([]string{"-restart"}, "", "Restart policy when the dies")
flRestartPolicy = cmd.String([]string{"-restart"}, "", "Restart policy to apply when a container exits (no, on-failure, always)")
// For documentation purpose
_ = cmd.Bool([]string{"#sig-proxy", "-sig-proxy"}, true, "Proxy received signals to the process (even in non-TTY mode). SIGCHLD, SIGSTOP, and SIGKILL are not proxied.")
_ = cmd.String([]string{"#name", "-name"}, "", "Assign a name to the container")
@ -227,8 +228,6 @@ func parseRun(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Conf
}
// parse the '-e' and '--env' after, to allow override
envVariables = append(envVariables, flEnv.GetAll()...)
// boo, there's no debug output for docker run
//log.Debugf("Environment variables for the container: %#v", envVariables)
netMode, err := parseNetMode(*flNetMode)
if err != nil {
@ -240,6 +239,10 @@ func parseRun(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Conf
return nil, nil, cmd, err
}
if *flAutoRemove && (restartPolicy.Name == "always" || restartPolicy.Name == "on-failure") {
return nil, nil, cmd, ErrConflictRestartPolicyAndAutoRemove
}
config := &Config{
Hostname: hostname,
Domainname: domainname,
@ -307,7 +310,15 @@ func parseRestartPolicy(policy string) (RestartPolicy, error) {
)
switch name {
case "no", "on-failure", "always":
case "always":
p.Name = name
if len(parts) == 2 {
return p, fmt.Errorf("maximum restart count not valid with restart policy of \"always\"")
}
case "no":
// do nothing
case "on-failure":
p.Name = name
if len(parts) == 2 {