15 "github.com/Sirupsen/logrus"
16 "github.com/opencontainers/runc/libcontainer/cgroups"
17 "github.com/opencontainers/runc/libcontainer/configs"
18 "github.com/opencontainers/runc/libcontainer/system"
19 "github.com/opencontainers/runc/libcontainer/user"
20 "github.com/opencontainers/runc/libcontainer/utils"
21 "github.com/vishvananda/netlink"
27 initSetns initType = "setns"
28 initStandard initType = "standard"
35 // network is an internal struct used to set up container networks.
39 // TempVethPeerName is a unique temporary veth peer name that was placed into
40 // the container's namespace.
41 TempVethPeerName string `json:"temp_veth_peer_name"`
44 // initConfig is used for transferring parameters from Exec() to Init().
// newContainerInit decodes it from JSON read off the init pipe, so the
// field tags below define the wire format.
45 type initConfig struct {
46 Args []string `json:"args"`
// Env holds "KEY=VALUE" pairs that populateProcessEnvironment loads into
// the current process environment.
47 Env []string `json:"env"`
// Cwd, when non-empty, is the directory finalizeNamespace chdirs into.
48 Cwd string `json:"cwd"`
// Capabilities, when non-nil, takes precedence over Config.Capabilities
// in finalizeNamespace.
49 Capabilities *configs.Capabilities `json:"capabilities"`
50 ProcessLabel string `json:"process_label"`
51 AppArmorProfile string `json:"apparmor_profile"`
52 NoNewPrivileges bool `json:"no_new_privileges"`
// User is the user specification setupUser resolves through
// user.GetExecUserPath.
53 User string `json:"user"`
// AdditionalGroups are resolved by setupUser through
// user.GetAdditionalGroupsPath when the list is non-empty.
54 AdditionalGroups []string `json:"additional_groups"`
55 Config *configs.Config `json:"config"`
// Networks are initialized one at a time by setupNetwork.
56 Networks []*network `json:"network"`
// PassedFilesCount is added to 3 to find the first fd that
// finalizeNamespace marks close-on-exec (the 3 presumably accounts for
// stdin, stdout and stderr — TODO confirm).
57 PassedFilesCount int `json:"passed_files_count"`
58 ContainerId string `json:"containerid"`
59 Rlimits []configs.Rlimit `json:"rlimits"`
60 CreateConsole bool `json:"create_console"`
// Rootless makes setupUser skip the setgroups call.
61 Rootless bool `json:"rootless"`
// initer is the type newContainerInit hands back to its caller.
// NOTE(review): the method set is not shown here — confirm before relying
// on it.
64 type initer interface {
// newContainerInit decodes an initConfig from pipe as JSON, loads its Env
// into the current process environment, and returns the initer for init
// type t. An unrecognized t yields an "unknown init type" error.
68 func newContainerInit(t initType, pipe *os.File, consoleSocket *os.File, stateDirFD int) (initer, error) {
69 var config *initConfig
70 if err := json.NewDecoder(pipe).Decode(&config); err != nil {
// Apply the configured environment before any init is constructed.
73 if err := populateProcessEnvironment(config.Env); err != nil {
// NOTE(review): presumably the initSetns case — confirm.
78 return &linuxSetnsInit{
80 consoleSocket: consoleSocket,
// NOTE(review): presumably the initStandard case — confirm.
84 return &linuxStandardInit{
86 consoleSocket: consoleSocket,
// Record the parent's pid as seen when the init is constructed.
87 parentPid: syscall.Getppid(),
89 stateDirFD: stateDirFD,
92 return nil, fmt.Errorf("unknown init type %q", t)
95 // populateProcessEnvironment loads the provided environment variables into the
96 // current process's environment.
// Each entry is split on its first "=" into a name and a value and applied
// with os.Setenv; an entry it rejects produces an "invalid environment"
// error.
97 func populateProcessEnvironment(env []string) error {
98 for _, pair := range env {
// A limit of 2 keeps any further "=" characters inside the value.
99 p := strings.SplitN(pair, "=", 2)
101 return fmt.Errorf("invalid environment '%v'", pair)
103 if err := os.Setenv(p[0], p[1]); err != nil {
110 // finalizeNamespace drops the caps, sets the correct user
111 // and working dir, and marks any leaked file descriptors close-on-exec
112 // before executing the command inside the namespace.
113 func finalizeNamespace(config *initConfig) error {
114 // Ensure that all unwanted fds we may have accidentally
115 // inherited are marked close-on-exec so they are not passed on to the
// command we exec. The +3 presumably skips stdin, stdout and stderr —
// TODO confirm.
117 if err := utils.CloseExecFrom(config.PassedFilesCount + 3); err != nil {
// initConfig.Capabilities takes precedence over Config.Capabilities; with
// neither set, an empty capability set is used.
121 capabilities := &configs.Capabilities{}
122 if config.Capabilities != nil {
123 capabilities = config.Capabilities
124 } else if config.Config.Capabilities != nil {
125 capabilities = config.Config.Capabilities
127 w, err := newContainerCapList(capabilities)
131 // drop capabilities in bounding set before changing user
132 if err := w.ApplyBoundingSet(); err != nil {
135 // preserve existing capabilities while we change users
136 if err := system.SetKeepCaps(); err != nil {
139 if err := setupUser(config); err != nil {
// setupUser has run; clear the keep-caps setting made above.
142 if err := system.ClearKeepCaps(); err != nil {
// Apply the capability list built from capabilities above.
145 if err := w.ApplyCaps(); err != nil {
// Only change directory when a working directory was configured.
148 if config.Cwd != "" {
149 if err := syscall.Chdir(config.Cwd); err != nil {
150 return fmt.Errorf("chdir to cwd (%q) set in config.json failed: %v", config.Cwd, err)
156 // setupConsole sets up the console from inside the container, and sends the
157 // master pty fd over socket (using cmsg). This is done to ensure that
158 // consoles are scoped to a container properly (see runc#814 and the many
159 // issues related to that). This has to be run *after* we've pivoted to the new
160 // rootfs (and the users' configuration is entirely set up).
161 func setupConsole(socket *os.File, config *initConfig, mount bool) error {
163 // At this point, /dev/ptmx points to something that we would expect. We
164 // used to change the owner of the slave path, but the /dev/pts mount can
165 // have gid=X set (at the users' option), so touching the owner of the
166 // slave PTY is not necessary, as the kernel will handle that for us. Note
167 // however, that setupUser (specifically fixStdioPermissions) *will* change
168 // the UID owner of the console to be the user the process will run as (so
169 // they can actually control their console).
170 console, err := newConsole()
174 // After we return from here, we don't need the console anymore.
175 defer console.Close()
// The calls below go through the concrete *linuxConsole rather than the
// value returned by newConsole.
177 linuxConsole, ok := console.(*linuxConsole)
179 return fmt.Errorf("failed to cast console to *linuxConsole")
181 // Mount the console inside our rootfs.
// NOTE(review): presumably only done when the mount argument is true —
// confirm the guarding condition.
183 if err := linuxConsole.mount(); err != nil {
187 // While we can access console.master, using the API is a good idea.
188 if err := utils.SendFd(socket, linuxConsole.File()); err != nil {
191 // Now, dup over all the things.
192 return linuxConsole.dupStdio()
195 // syncParentReady sends to the given pipe a JSON payload which indicates that
196 // the init is ready to Exec the child process. It then waits for the parent to
197 // indicate that it is cleared to Exec.
// The outgoing message is procReady and the awaited reply is procRun.
198 func syncParentReady(pipe io.ReadWriter) error {
// Announce procReady to the parent.
200 if err := writeSync(pipe, procReady); err != nil {
204 // Wait for parent to give the all-clear.
205 if err := readSync(pipe, procRun); err != nil {
212 // syncParentHooks sends to the given pipe a JSON payload which indicates that
213 // the parent should execute pre-start hooks. It then waits for the parent to
214 // indicate that it is cleared to resume.
// The outgoing message is procHooks and the awaited reply is procResume.
215 func syncParentHooks(pipe io.ReadWriter) error {
// Ask the parent to run the hooks.
217 if err := writeSync(pipe, procHooks); err != nil {
221 // Wait for parent to give the all-clear.
222 if err := readSync(pipe, procResume); err != nil {
229 // setupUser changes the groups, gid, and uid for the user inside the container.
// It resolves config.User and config.AdditionalGroups against the passwd and
// group files, fixes ownership of the stdio fds, applies the ids, and sets
// HOME from the resolved user when the environment has no HOME value.
230 func setupUser(config *initConfig) error {
// Passed to user.GetExecUserPath below as its defaults argument.
232 defaultExecUser := user.ExecUser{
238 passwdPath, err := user.GetPasswdPath()
243 groupPath, err := user.GetGroupPath()
248 execUser, err := user.GetExecUserPath(config.User, &defaultExecUser, passwdPath, groupPath)
// Additional groups are only looked up when some were requested.
254 if len(config.AdditionalGroups) > 0 {
255 addGroups, err = user.GetAdditionalGroupsPath(config.AdditionalGroups, groupPath)
// NOTE(review): the error texts below indicate these three checks apply to
// rootless containers only — confirm the guarding condition.
262 if execUser.Uid != 0 {
263 return fmt.Errorf("cannot run as a non-root user in a rootless container")
266 if execUser.Gid != 0 {
267 return fmt.Errorf("cannot run as a non-root group in a rootless container")
270 // We cannot set any additional groups in a rootless container and thus we
271 // bail if the user asked us to do so. TODO: We currently can't do this
272 // earlier, but if libcontainer.Process.User was typesafe this might work.
273 if len(addGroups) > 0 {
274 return fmt.Errorf("cannot set any additional groups in a rootless container")
278 // before we change to the container's user make sure that the process's STDIO
279 // is correctly owned by the user that we are switching to.
280 if err := fixStdioPermissions(config, execUser); err != nil {
284 // Calling setgroups isn't allowed in an unprivileged user namespace since Linux 3.19.
285 // There's nothing we can do about /etc/group entries, so we silently
286 // ignore setting groups here (since the user didn't explicitly ask us to).
288 if !config.Rootless {
// Supplementary groups are the resolved user's own groups followed by the
// additional groups.
289 suppGroups := append(execUser.Sgids, addGroups...)
290 if err := syscall.Setgroups(suppGroups); err != nil {
// The gid is applied before the uid.
295 if err := system.Setgid(execUser.Gid); err != nil {
299 if err := system.Setuid(execUser.Uid); err != nil {
303 // if we didn't get HOME already, set it based on the user's HOME
304 if envHome := os.Getenv("HOME"); envHome == "" {
305 if err := os.Setenv("HOME", execUser.Home); err != nil {
312 // fixStdioPermissions fixes the permissions of PID 1's STDIO within the container to the specified user.
313 // The ownership needs to match because it is created outside of the container.
// Only the uid is changed; each fd keeps the gid it already has.
315 func fixStdioPermissions(config *initConfig, u *user.ExecUser) error {
// Stat /dev/null once so each fd's device number can be compared with it.
316 var null syscall.Stat_t
317 if err := syscall.Stat("/dev/null", &null); err != nil {
320 for _, fd := range []uintptr{
326 if err := syscall.Fstat(int(fd), &s); err != nil {
330 // Skip chown of /dev/null if it was used as one of the STDIO fds.
331 if s.Rdev == null.Rdev {
335 // Skip chown if s.Gid is actually an unmapped gid in the host. While
336 // this is a bit dodgy if it just so happens that the console _is_
337 // owned by overflow_gid, there's no way for us to disambiguate this as
338 // a userspace program.
339 if _, err := config.Config.HostGID(int(s.Gid)); err != nil {
343 // We only change the uid owner (as it is possible for the mount to
344 // prefer a different gid, and there's no reason for us to change it).
345 // The reason why we don't just leave the default uid=X mount setup is
346 // that users expect to be able to actually use their console. Without
347 // this code, you couldn't effectively run as a non-root user inside a
348 // container and also have a console set up.
349 if err := syscall.Fchown(int(fd), u.Uid, int(s.Gid)); err != nil {
356 // setupNetwork sets up and initializes any network interface inside the container.
// Each entry of config.Networks is initialized by the strategy registered
// for its Type.
357 func setupNetwork(config *initConfig) error {
// Note: the loop variable shadows the config parameter inside the loop.
358 for _, config := range config.Networks {
359 strategy, err := getStrategy(config.Type)
363 if err := strategy.initialize(config); err != nil {
// setupRoute adds one netlink route per entry in config.Routes.
// For each entry the Destination is parsed as a CIDR, the Source and Gateway
// as IP addresses, and the InterfaceName is resolved to a link whose index is
// set on the route.
370 func setupRoute(config *configs.Config) error {
// Note: the loop variable shadows the config parameter inside the loop.
371 for _, config := range config.Routes {
372 _, dst, err := net.ParseCIDR(config.Destination)
376 src := net.ParseIP(config.Source)
378 return fmt.Errorf("Invalid source for route: %s", config.Source)
380 gw := net.ParseIP(config.Gateway)
382 return fmt.Errorf("Invalid gateway for route: %s", config.Gateway)
384 l, err := netlink.LinkByName(config.InterfaceName)
388 route := &netlink.Route{
389 Scope: netlink.SCOPE_UNIVERSE,
// Bind the route to the interface looked up above.
393 LinkIndex: l.Attrs().Index,
395 if err := netlink.RouteAdd(route); err != nil {
// setupRlimits applies every entry of limits to the process identified by pid
// through system.Prlimit, using Hard as the maximum and Soft as the current
// value. The first failure is returned as an error naming the rlimit type.
402 func setupRlimits(limits []configs.Rlimit, pid int) error {
403 for _, rlimit := range limits {
404 if err := system.Prlimit(pid, rlimit.Type, syscall.Rlimit{Max: rlimit.Hard, Cur: rlimit.Soft}); err != nil {
405 return fmt.Errorf("error setting rlimit type %v: %v", rlimit.Type, err)
// siginfo is presumably the result buffer handed to the waitid syscall in
// isWaitable — TODO confirm; its fields are not shown here.
413 type siginfo struct {
417 // below here is a union; si_pid is the only field we use
419 // Pad to 128 bytes as detailed in blockUntilWaitable
423 // isWaitable returns true if the process has exited, false otherwise.
424 // It's based off blockUntilWaitable in src/os/wait_waitid.go
425 func isWaitable(pid int) (bool, error) {
// WEXITED selects exited children, WNOHANG keeps the call from blocking and
// WNOWAIT leaves the child waitable so a later Wait can still reap it.
427 _, _, e := syscall.Syscall6(syscall.SYS_WAITID, _P_PID, uintptr(pid), uintptr(unsafe.Pointer(si)), syscall.WEXITED|syscall.WNOWAIT|syscall.WNOHANG, 0, 0)
429 return false, os.NewSyscallError("waitid", e)
// A non-zero si_pid means waitid filled in a child that is ready to report.
432 return si.si_pid != 0, nil
435 // isNoChildren returns true if err represents a syscall.ECHILD, false otherwise.
// ECHILD is recognized both directly and when wrapped in an *os.SyscallError.
436 func isNoChildren(err error) bool {
437 switch err := err.(type) {
439 if err == syscall.ECHILD {
442 case *os.SyscallError:
// Look at the error wrapped by the SyscallError.
443 if err.Err == syscall.ECHILD {
450 // signalAllProcesses freezes then iterates over all the processes inside the
451 // manager's cgroups sending the signal s to them.
452 // If s is SIGKILL then it will wait for each process to exit.
453 // For all other signals it will check if the process is ready to report its
454 // exit status and only if it is will a wait be performed.
455 func signalAllProcesses(m cgroups.Manager, s os.Signal) error {
456 var procs []*os.Process
// Freeze the cgroup before the pid list is read.
457 if err := m.Freeze(configs.Frozen); err != nil {
460 pids, err := m.GetAllPids()
// The result of this thaw is not checked.
462 m.Freeze(configs.Thawed)
465 for _, pid := range pids {
466 p, err := os.FindProcess(pid)
// Remember the process so it can be waited on after the thaw.
471 procs = append(procs, p)
472 if err := p.Signal(s); err != nil {
// Thaw so the signalled processes can run again.
476 if err := m.Freeze(configs.Thawed); err != nil {
480 for _, p := range procs {
481 if s != syscall.SIGKILL {
482 if ok, err := isWaitable(p.Pid); err != nil {
// ECHILD is not logged.
483 if !isNoChildren(err) {
484 logrus.Warn("signalAllProcesses: ", p.Pid, err)
488 // Not ready to report so don't wait
493 if _, err := p.Wait(); err != nil {
// ECHILD is not logged.
494 if !isNoChildren(err) {
495 logrus.Warn("wait: ", err)