fix compile error
[platform/upstream/docker-engine.git] / daemon / health.go
1 package daemon
2
3 import (
4         "bytes"
5         "fmt"
6         "runtime"
7         "strings"
8         "sync"
9         "time"
10
11         "golang.org/x/net/context"
12
13         "github.com/Sirupsen/logrus"
14         "github.com/docker/docker/api/types"
15         containertypes "github.com/docker/docker/api/types/container"
16         "github.com/docker/docker/api/types/strslice"
17         "github.com/docker/docker/container"
18         "github.com/docker/docker/daemon/exec"
19         "github.com/docker/docker/restartmanager"
20 )
21
const (
	// Longest healthcheck probe output message to store (in bytes).
	// Longer messages will be truncated (see limitedBuffer).
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// The time given for the container to start before the health check starts considering
	// the container unstable. Defaults to none.
	defaultStartPeriod = 0 * time.Second

	// Default number of consecutive failures of the health check
	// for the container to be considered unhealthy.
	defaultProbeRetries = 3

	// Maximum number of entries to record in Health.Log before old
	// entries are evicted (oldest first).
	maxLogEntries = 5
)
45
const (
	// Exit status codes that can be returned by the probe command.
	// Any other exit code is also treated as a failure by handleProbeResult.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
)
52
// probe implementations know how to run a particular type of probe.
type probe interface {
	// run performs one run of the check. It returns the probe result
	// (exit code, output, timestamps), or an error if the probe could
	// not be executed at all.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}
59
// cmdProbe implements the "CMD" and "CMD-SHELL" probe types (see getProbe).
type cmdProbe struct {
	// shell selects running the command with the system's default shell
	// ("CMD-SHELL") instead of execing it directly ("CMD").
	shell bool
}
65
// run execs the healthcheck command in the container via the daemon's exec
// machinery and returns the exit code and probe output (if any), with output
// capped at maxOutputLen bytes.
func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
	// Test[0] is the probe type token ("CMD"/"CMD-SHELL"); the remainder
	// is the command to execute.
	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
	if p.shell {
		// Prefix the container's shell (or the platform default) so the
		// command string is interpreted by a shell.
		cmdSlice = append(getShell(cntr.Config), cmdSlice...)
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = cntr.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = cntr.Config.User

	linkedEnv, err := d.setupLinkedContainers(cntr)
	if err != nil {
		return nil, err
	}
	execConfig.Env = container.ReplaceOrAppendEnvValues(cntr.CreateDaemonEnvironment(execConfig.Tty, linkedEnv), execConfig.Env)

	d.registerExecCommand(cntr, execConfig)
	d.LogContainerEvent(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))

	// Capture stdout and stderr into a single size-limited buffer.
	output := &limitedBuffer{}
	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("Healthcheck for container %s has no exit code!", cntr.ID)
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}
115
// handleProbeResult updates the container's State.Health struct based on the
// latest probe's result, checkpoints the change, and (in experimental mode)
// may trigger a restart of a container that turned unhealthy.
// done is the monitor's stop channel: a closed channel means monitoring was
// cancelled while this result was in flight, so the result is discarded.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult, done chan struct{}) {
	c.Lock()
	defer c.Unlock()

	// probe may have been cancelled while waiting on lock. Ignore result then.
	// (Non-blocking receive: falls through when the channel is still open.)
	select {
	case <-done:
		return
	default:
	}

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = defaultProbeRetries
	}

	h := c.State.Health
	oldStatus := h.Status

	// Keep at most maxLogEntries results, evicting the oldest.
	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.Status = types.Healthy
	} else { // Failure (including invalid exit code)
		shouldIncrementStreak := true

		// If the container is starting (i.e. we never had a successful health check)
		// then we check if we are within the start period of the container in which
		// case we do not increment the failure streak.
		if h.Status == types.Starting {
			startPeriod := timeoutWithDefault(c.Config.Healthcheck.StartPeriod, defaultStartPeriod)
			timeSinceStart := result.Start.Sub(c.State.StartedAt)

			// If still within the start period, then don't increment failing streak.
			if timeSinceStart < startPeriod {
				shouldIncrementStreak = false
			}
		}

		if shouldIncrementStreak {
			h.FailingStreak++

			// Flip to Unhealthy only after `retries` consecutive failures.
			if h.FailingStreak >= retries {
				h.Status = types.Unhealthy
			}
		}
		// Else we're starting or healthy. Stay in that state.
	}

	// replicate Health status changes
	if err := c.CheckpointTo(d.containersReplica); err != nil {
		// queries will be inconsistent until the next probe runs or other state mutations
		// checkpoint the container
		logrus.Errorf("Error replicating health state for container %s: %v", c.ID, err)
	}

	if oldStatus != h.Status {
		d.LogContainerEvent(c, "health_status: "+h.Status)

		// Experimental: restart the container when it becomes unhealthy,
		// deferring to the restart manager's policy decision.
		if d.HasExperimental() && h.Status == types.Unhealthy {
			restart, wait, err := c.RestartManager().ShouldRestart(0, false, time.Since(c.StartedAt), c.Health.Health)
			if err == nil && restart {
				logrus.Infof("Unhealthy container %v: restarting...", c.ID)
				go func() {
					err := <-wait
					if err == nil {
						d.stopHealthchecks(c)
						if err := d.containerRestart(c, c.StopTimeout()); err != nil {
							logrus.Debugf("failed to restart container: %+v", err)
						}
					} else if err != restartmanager.ErrRestartCanceled {
						logrus.Errorf("restartmanger wait error: %+v", err)
					}
				}()
			}
		}
	}
}
200
// monitor runs the container's health-check loop until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
	for {
		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
			return
		case <-time.After(probeInterval):
			logrus.Debugf("Running health check for container %s ...", c.ID)
			startTime := time.Now()
			// Each probe run is bounded by probeTimeout via the context.
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			// Buffered (cap 1) so the probe goroutine can always deliver its
			// result and exit, even when the receiver has moved on.
			results := make(chan *types.HealthcheckResult, 1)
			go func() {
				healthChecksCounter.Inc()
				result, err := probe.run(ctx, d, c)
				if err != nil {
					healthChecksFailedCounter.Inc()
					logrus.Warnf("Health check for container %s error: %v", c.ID, err)
					// Surface probe execution errors as a failing result.
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check for container %s done (exitCode=%d)", c.ID, result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring for container %s (received while probing)", c.ID)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
				return
			case result := <-results:
				handleProbeResult(d, c, result, stop)
				// Stop timeout
				cancelProbe()
			case <-ctx.Done():
				// Probe exceeded probeTimeout: record a synthetic failure.
				logrus.Debugf("Health check for container %s taking too long", c.ID)
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				}, stop)
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}
263
264 // Get a suitable probe implementation for the container's healthcheck configuration.
265 // Nil will be returned if no healthcheck was configured or NONE was set.
266 func getProbe(c *container.Container) probe {
267         config := c.Config.Healthcheck
268         if config == nil || len(config.Test) == 0 {
269                 return nil
270         }
271         switch config.Test[0] {
272         case "CMD":
273                 return &cmdProbe{shell: false}
274         case "CMD-SHELL":
275                 return &cmdProbe{shell: true}
276         default:
277                 logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD') in container %s", config.Test[0], c.ID)
278                 return nil
279         }
280 }
281
282 // Ensure the health-check monitor is running or not, depending on the current
283 // state of the container.
284 // Called from monitor.go, with c locked.
285 func (d *Daemon) updateHealthMonitor(c *container.Container) {
286         h := c.State.Health
287         if h == nil {
288                 return // No healthcheck configured
289         }
290
291         probe := getProbe(c)
292         wantRunning := c.Running && !c.Paused && probe != nil
293         if wantRunning {
294                 if stop := h.OpenMonitorChannel(); stop != nil {
295                         go monitor(d, c, stop, probe)
296                 }
297         } else {
298                 h.CloseMonitorChannel()
299         }
300 }
301
302 // Reset the health state for a newly-started, restarted or restored container.
303 // initHealthMonitor is called from monitor.go and we should never be running
304 // two instances at once.
305 // Called with c locked.
306 func (d *Daemon) initHealthMonitor(c *container.Container) {
307         // If no healthcheck is setup then don't init the monitor
308         if getProbe(c) == nil {
309                 return
310         }
311
312         // This is needed in case we're auto-restarting
313         d.stopHealthchecks(c)
314
315         if h := c.State.Health; h != nil {
316                 h.Status = types.Starting
317                 h.FailingStreak = 0
318         } else {
319                 h := &container.Health{}
320                 h.Status = types.Starting
321                 c.State.Health = h
322         }
323
324         d.updateHealthMonitor(c)
325 }
326
327 // Called when the container is being stopped (whether because the health check is
328 // failing or for any other reason).
329 func (d *Daemon) stopHealthchecks(c *container.Container) {
330         h := c.State.Health
331         if h != nil {
332                 h.CloseMonitorChannel()
333         }
334 }
335
// limitedBuffer buffers up to maxOutputLen bytes. Further data is discarded
// and recorded via the truncated flag. Access is serialized by mu, since the
// exec wiring passes the same buffer for both stdout and stderr.
type limitedBuffer struct {
	buf       bytes.Buffer // guarded by mu
	mu        sync.Mutex
	truncated bool // indicates that data has been lost; guarded by mu
}
342
343 // Append to limitedBuffer while there is room.
344 func (b *limitedBuffer) Write(data []byte) (int, error) {
345         b.mu.Lock()
346         defer b.mu.Unlock()
347
348         bufLen := b.buf.Len()
349         dataLen := len(data)
350         keep := min(maxOutputLen-bufLen, dataLen)
351         if keep > 0 {
352                 b.buf.Write(data[:keep])
353         }
354         if keep < dataLen {
355                 b.truncated = true
356         }
357         return dataLen, nil
358 }
359
360 // The contents of the buffer, with "..." appended if it overflowed.
361 func (b *limitedBuffer) String() string {
362         b.mu.Lock()
363         defer b.mu.Unlock()
364
365         out := b.buf.String()
366         if b.truncated {
367                 out = out + "..."
368         }
369         return out
370 }
371
// timeoutWithDefault returns configuredValue, substituting defaultValue
// when the configured duration is zero (i.e. unset).
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue != 0 {
		return configuredValue
	}
	return defaultValue
}
379
// min returns the smaller of two ints.
func min(x, y int) int {
	if y < x {
		return y
	}
	return x
}
386
387 func getShell(config *containertypes.Config) []string {
388         if len(config.Shell) != 0 {
389                 return config.Shell
390         }
391         if runtime.GOOS != "windows" {
392                 return []string{"/bin/sh", "-c"}
393         }
394         return []string{"cmd", "/S", "/C"}
395 }