fix compile error
[platform/upstream/docker-engine.git] / daemon / health.go
1 package daemon
2
3 import (
4         "bytes"
5         "fmt"
6         "runtime"
7         "strings"
8         "sync"
9         "time"
10
11         "golang.org/x/net/context"
12
13         "github.com/Sirupsen/logrus"
14         "github.com/docker/docker/api/types"
15         containertypes "github.com/docker/docker/api/types/container"
16         "github.com/docker/docker/api/types/strslice"
17         "github.com/docker/docker/container"
18         "github.com/docker/docker/daemon/exec"
19         "github.com/docker/docker/restartmanager"
20 )
21
const (
	// Longest healthcheck probe output message to store (in bytes).
	// Longer messages will be truncated (see limitedBuffer).
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record in Health.Log before old
	// entries are evicted (oldest first).
	maxLogEntries = 5
)
45
const (
	// Exit status codes that can be returned by the probe command.
	// Any other exit code is also treated as a failure by handleProbeResult.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
)
52
// probe implementations know how to run a particular type of probe.
type probe interface {
	// run performs one run of the check. It returns the probe result
	// (exit code, output, timestamps), or an error if the probe could
	// not be executed at all.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}
59
// cmdProbe implements the "CMD" and "CMD-SHELL" probe types (see getProbe).
type cmdProbe struct {
	// shell selects running the command with the system's default shell
	// ("CMD-SHELL") instead of execing it directly ("CMD").
	shell bool
}
65
// run execs the healthcheck command in the container via the daemon's exec
// machinery and returns the exit code and probe output (if any), with output
// capped at maxOutputLen bytes.
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	// Test[0] is the probe type token ("CMD"/"CMD-SHELL"); the remainder
	// is the command to execute.
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		// Prefix the container's shell (or the platform default) so the
		// command string is interpreted by a shell.
		cmdSlice = append(getShell(cntr.Config), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	d.LogContainerEvent(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))

	// Capture stdout and stderr into a single size-limited buffer.
	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("Healthcheck for container %s has no exit code!", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}
115
// handleProbeResult updates the container's State.Health struct based on the
// latest probe's result, checkpoints the change, and (in experimental mode)
// may trigger a restart of a container that turned unhealthy.
// done is the monitor's stop channel: a closed channel means monitoring was
// cancelled while this result was in flight, so the result is discarded.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// probe may have been cancelled while waiting on lock. Ignore result then.
	// (Non-blocking receive: falls through when the channel is still open.)
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status

	// Keep at most maxLogEntries results, evicting the oldest.
	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.Status = types.Healthy
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check if we are within the start period of the container in which
		// case we do not increment the failure streak.
		if h.Status == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			// Flip to Unhealthy only after `retries` consecutive failures.
			if h.FailingStreak >= retries {
				h.Status = types.Unhealthy
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// replicate Health status changes
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// queries will be inconsistent until the next probe runs or other state mutations
		// checkpoint the container
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	if oldStatus != h.Status {
		d.LogContainerEvent(c, "health_status: "+h.Status)

		// Experimental: restart the container when it becomes unhealthy,
		// deferring to the restart manager's policy decision.
		if d.HasExperimental() && h.Status == types.Unhealthy {
			restart, wait, err := c.RestartManager().ShouldRestart(0, false, time.Since(c.StartedAt), c.Health.Health)
			if err == nil && restart {
				logrus.Infof("Unhealthy container %v: restarting...", c.ID)
				go func() {
					err := <-wait
					if err == nil {
						d.stopHealthchecks(c)
						if err := d.containerRestart(c, c.StopTimeout()); err != nil {
							logrus.Debugf("failed to restart container: %+v", err)
						}
					} else if err != restartmanager.ErrRestartCanceled {
						logrus.Errorf("restartmanger wait error: %+v", err)
					}
				}()
			}
		}
	}
}
200
// monitor runs the container's health-check loop until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
	for {
		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-time.After(probeInterval):
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			// Each probe run is bounded by probeTimeout via the context.
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			// Buffered (cap 1) so the probe goroutine can always deliver its
			// result and exit, even when the receiver has moved on.
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					// Surface probe execution errors as a failing result.
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Stop timeout
				cancelProbe()
			case <-ctx.Done():
				// Probe exceeded probeTimeout: record a synthetic failure.
				logrus.Debugf("Health check for container %s taking too long", c.ID)
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				}, stop)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}
263
264 // Get a suitable probe implementation for the container's healthcheck configuration.
265 // Nil will be returned if no healthcheck was configured or NONE was set.
266 func getProbe(c *container.Container) probe {
267         config := c.Config.Healthcheck
268         if config == nil || len(config.Test) == 0 {
269                 return nil
270         }
271         switch config.Test[0] {
272         case "CMD":
273                 return &cmdProbe{shell: false}
274         case "CMD-SHELL":
275                 return &cmdProbe{shell: true}
276         default:
277                 logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
278                 return nil
279         }
280 }
281
282 // Ensure the health-check monitor is running or not, depending on the current
283 // state of the container.
284 // Called from monitor.go, with c locked.
285 func (d *Daemon) updateHealthMonitor(c *container.Container) {
286         h := c.State.Health
287         if h == nil {
288                 return // No healthcheck configured
289         }
290
291         probe := getProbe(c)
292         wantRunning := c.Running && !c.Paused && probe != nil
293         if wantRunning {
294                 if stop := h.OpenMonitorChannel(); stop != nil {
295                         go monitor(d, c, stop, probe)
296                 }
297         } else {
298                 h.CloseMonitorChannel()
299         }
300 }
301
302 // Reset the health state for a newly-started, restarted or restored container.
303 // initHealthMonitor is called from monitor.go and we should never be running
304 // two instances at once.
305 // Called with c locked.
306 func (d *Daemon) initHealthMonitor(c *container.Container) {
307         // If no healthcheck is setup then don't init the monitor
308         if getProbe(c) == nil {
309                 return
310         }
311
312         // This is needed in case we're auto-restarting
313         d.stopHealthchecks(c)
314
315         if h := c.State.Health; h != nil {
316                 h.Status = types.Starting
317                 h.FailingStreak = 0
318         } else {
319                 h := &container.Health{}
320                 h.Status = types.Starting
321                 c.State.Health = h
322         }
323
324         d.updateHealthMonitor(c)
325 }
326
327 // Called when the container is being stopped (whether because the health check is
328 // failing or for any other reason).
329 func (d *Daemon) stopHealthchecks(c *container.Container) {
330         h := c.State.Health
331         if h != nil {
332                 h.CloseMonitorChannel()
333         }
334 }
335
// limitedBuffer buffers up to maxOutputLen bytes. Further data is discarded
// and recorded via the truncated flag. Access is serialized by mu, since the
// exec wiring passes the same buffer for both stdout and stderr.
type limitedBuffer struct {
	buf       bytes.Buffer // guarded by mu
	mu        sync.Mutex
	truncated bool // indicates that data has been lost; guarded by mu
}
342
343 // Append to limitedBuffer while there is room.
344 func (b *limitedBuffer) Write(data []byte) (int, error) {
345         b.mu.Lock()
346         defer b.mu.Unlock()
347
348         bufLen := b.buf.Len()
349         dataLen := len(data)
350         keep := min(maxOutputLen-bufLen, dataLen)
351         if keep > 0 {
352                 b.buf.Write(data[:keep])
353         }
354         if keep < dataLen {
355                 b.truncated = true
356         }
357         return dataLen, nil
358 }
359
360 // The contents of the buffer, with "..." appended if it overflowed.
361 func (b *limitedBuffer) String() string {
362         b.mu.Lock()
363         defer b.mu.Unlock()
364
365         out := b.buf.String()
366         if b.truncated {
367                 out = out + "..."
368         }
369         return out
370 }
371
// timeoutWithDefault returns configuredValue, substituting defaultValue
// when the configured duration is zero (i.e. unset).
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue != 0 {
		return configuredValue
	}
	return defaultValue
}
379
// min returns the smaller of two ints.
func min(x, y int) int {
	if y < x {
		return y
	}
	return x
}
386
387 func getShell(config *containertypes.Config) []string {
388         if len(config.Shell) != 0 {
389                 return config.Shell
390         }
391         if runtime.GOOS != "windows" {
392                 return []string{"/bin/sh", "-c"}
393         }
394         return []string{"cmd", "/S", "/C"}
395 }