11 "golang.org/x/net/context"
13 "github.com/Sirupsen/logrus"
14 "github.com/docker/docker/api/types"
15 containertypes "github.com/docker/docker/api/types/container"
16 "github.com/docker/docker/api/types/strslice"
17 "github.com/docker/docker/container"
18 "github.com/docker/docker/daemon/exec"
19 "github.com/docker/docker/restartmanager"
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of one run to the start of the next).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record in the health log.
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
)
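
// For orientation, a hypothetical Dockerfile healthcheck in shell form, e.g.
//
//	HEALTHCHECK --interval=5s CMD curl -f http://localhost/
//
// reaches this code as Config.Healthcheck.Test = ["CMD-SHELL", "curl -f http://localhost/"],
// while the exec form, HEALTHCHECK CMD ["curl", "-f", "http://localhost/"],
// arrives as ["CMD", "curl", "-f", "http://localhost/"]. Test[0] selects the
// probe type in getProbe below.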

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}
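
// A minimal sketch of an alternative implementation (hypothetical, for
// illustration only; cmdProbe below is the only probe the daemon defines):
//
//	type nopProbe struct{}
//
//	// run always reports success without probing the container.
//	func (nopProbe) run(ctx context.Context, d *Daemon, c *container.Container) (*types.HealthcheckResult, error) {
//		now := time.Now()
//		return &types.HealthcheckResult{Start: now, End: now, ExitCode: exitStatusHealthy}, nil
//	}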

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any).
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		cmdSlice = append(getShell(cntr.Config), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	d.LogContainerEvent(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))

	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("healthcheck for container %s has no exit code", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// The probe may have been cancelled while we waited for the lock; if so,
	// ignore the result.
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status

	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.Status = types.Healthy
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check whether we are within the start period of the container, in
		// which case we do not increment the failure streak.
		if h.Status == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
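				// For example, with a start period of 30s, a probe that fails
				// 10s after the container started is still logged above, but
				// leaves FailingStreak untouched and Status at "starting".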
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			if h.FailingStreak >= retries {
				h.Status = types.Unhealthy
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// Replicate the health status change to the in-memory view store.
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// Queries will be inconsistent until the next probe runs or another
		// state mutation checkpoints the container.
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	if oldStatus != h.Status {
		d.LogContainerEvent(c, "health_status: "+h.Status)
	}

	if d.HasExperimental() && h.Status == types.Unhealthy {
		restart, wait, err := c.RestartManager().ShouldRestart(0, false, time.Since(c.StartedAt), c.Health.Health)
		if err == nil && restart {
			logrus.Infof("Unhealthy container %v: restarting...", c.ID)
			// Restart in the background: the restart manager may impose a
			// backoff delay, and this function runs with the container locked.
			go func() {
				<-wait
				d.stopHealthchecks(c)
				if err := d.containerRestart(c, c.StopTimeout()); err != nil {
					logrus.Debugf("failed to restart container: %+v", err)
				}
			}()
		} else if err != restartmanager.ErrRestartCanceled {
			logrus.Errorf("restartmanager wait error: %+v", err)
		}
	}
}

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
	for {
		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-time.After(probeInterval):
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Stop timeout
				cancelProbe()
			case <-ctx.Done():
				logrus.Debugf("Health check for container %s taking too long", c.ID)
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				}, stop)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}
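
// Timing note (a restatement of the loop above, not new behaviour): with the
// defaults, the first probe fires roughly probeInterval after the monitor
// starts, and each subsequent interval is measured from the end of one
// handled result to the start of the next run; it is not a fixed-rate tick.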

// Get a suitable probe implementation for the container's healthcheck configuration.
// Nil will be returned if no healthcheck was configured or NONE was set.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
		return nil
	}
}

// Start or stop the health-check monitor as needed so that it matches the
// current state of the container.
// Called from monitor.go, with c locked.
func (d *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(d, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (d *Daemon) initHealthMonitor(c *container.Container) {
	// If no healthcheck is set up then don't init the monitor
	if getProbe(c) == nil {
		return
	}

	// This is needed in case we're auto-restarting
	d.stopHealthchecks(c)

	if h := c.State.Health; h != nil {
		h.Status = types.Starting
		h.FailingStreak = 0
	} else {
		h := &container.Health{}
		h.Status = types.Starting
		c.State.Health = h
	}

	d.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (d *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	mu        sync.Mutex
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	b.mu.Lock()
	defer b.mu.Unlock()

	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	b.mu.Lock()
	defer b.mu.Unlock()

	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}
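
// A quick usage sketch (illustrative, not part of the daemon's logic): the
// probe's stdout and stderr both stream into a single limitedBuffer, so a
// chatty check is silently capped at maxOutputLen bytes.
//
//	var b limitedBuffer
//	b.Write(make([]byte, maxOutputLen+1)) // extra byte is dropped; Write still reports success
//	s := b.String()                       // the first maxOutputLen bytes, with "..." appended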

// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}

// getShell returns the shell used to run "CMD-SHELL" probes: the container's
// configured SHELL if set, otherwise the platform default.
func getShell(config *containertypes.Config) []string {
	if len(config.Shell) != 0 {
		return config.Shell
	}
	if runtime.GOOS != "windows" {
		return []string{"/bin/sh", "-c"}
	}
	return []string{"cmd", "/S", "/C"}
}