Initial import
authorRobert Swiecki <swiecki@google.com>
Thu, 14 May 2015 21:44:48 +0000 (23:44 +0200)
committerRobert Swiecki <swiecki@google.com>
Thu, 14 May 2015 21:44:48 +0000 (23:44 +0200)
21 files changed:
CONTRIBUTING [new file with mode: 0644]
LICENSE [new file with mode: 0644]
Makefile [new file with mode: 0644]
README [new file with mode: 0644]
cmdline.c [new file with mode: 0644]
cmdline.h [new file with mode: 0644]
common.h [new file with mode: 0644]
contain.c [new file with mode: 0644]
contain.h [new file with mode: 0644]
log.c [new file with mode: 0644]
log.h [new file with mode: 0644]
net.c [new file with mode: 0644]
net.h [new file with mode: 0644]
nsjail.c [new file with mode: 0644]
nsjail.h [new file with mode: 0644]
sandbox.c [new file with mode: 0644]
sandbox.h [new file with mode: 0644]
seccomp/bpf-helper.c [new file with mode: 0644]
seccomp/bpf-helper.h [new file with mode: 0644]
subproc.c [new file with mode: 0644]
subproc.h [new file with mode: 0644]

diff --git a/CONTRIBUTING b/CONTRIBUTING
new file mode 100644 (file)
index 0000000..1ba8539
--- /dev/null
+++ b/CONTRIBUTING
@@ -0,0 +1,24 @@
+Want to contribute? Great! First, read this page (including the small print at the end).
+
+### Before you contribute
+Before we can use your code, you must sign the
+[Google Individual Contributor License Agreement](https://developers.google.com/open-source/cla/individual?csw=1)
+(CLA), which you can do online. The CLA is necessary mainly because you own the
+copyright to your changes, even after your contribution becomes part of our
+codebase, so we need your permission to use and distribute your code. We also
+need to be sure of various other things—for instance that you'll tell us if you
+know that your code infringes on other people's patents. You don't have to sign
+the CLA until after you've submitted your code for review and a member has
+approved it, but you must do it before we can put your code into our codebase.
+Before you start working on a larger contribution, you should get in touch with
+us first through the issue tracker with your idea so that we can help out and
+possibly guide you. Coordinating up front makes it much easier to avoid
+frustration later on.
+
+### Code reviews
+All submissions, including submissions by project members, require review. We
+use GitHub pull requests for this purpose.
+
+### The small print
+Contributions made by corporations are covered by a different agreement than
+the one above, the Software Grant and Corporate Contributor License Agreement.
diff --git a/LICENSE b/LICENSE
new file mode 100644 (file)
index 0000000..d645695
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,202 @@
+
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
diff --git a/Makefile b/Makefile
new file mode 100644 (file)
index 0000000..aaa24c3
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,59 @@
+#
+#   nsjail - Makefile
+#      -----------------------------------------
+#
+#   Copyright 2014 Google Inc. All Rights Reserved.
+#
+#   Licensed under the Apache License, Version 2.0 (the "License");
+#   you may not use this file except in compliance with the License.
+#   You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+#   Unless required by applicable law or agreed to in writing, software
+#   distributed under the License is distributed on an "AS IS" BASIS,
+#   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+#   See the License for the specific language governing permissions and
+#   limitations under the License.
+#
+
+# Compiler and hardening/warning flags. CFLAGS is appended to, so flags passed
+# in from the environment are kept. Note that -c is part of CFLAGS, which makes
+# it usable for compile-only steps exclusively.
+CC = gcc
+CFLAGS += -O2 -g -ggdb -c -std=c11 \
+       -D_GNU_SOURCE \
+       -fstack-protector-all -Wformat -Wformat=2 -Wformat-security -fPIE -D_FORTIFY_SOURCE=2 -Wa,--noexecstack \
+       -Wall -Wextra -Werror
+
+# Linker driver and flags (full RELRO, position-independent executable).
+LD = gcc
+LDFLAGS += -Wl,-z,now -Wl,-z,relro -pie
+
+SRCS = nsjail.c cmdline.c contain.c log.c net.c subproc.c sandbox.c seccomp/bpf-helper.c
+OBJS = $(SRCS:.c=.o)
+BIN = nsjail
+
+# These targets do not produce files named after themselves.
+.PHONY: all clean depend indent
+
+# Compile each C source into its object file. A pattern rule is used because a
+# suffix rule (.c.o) must not list prerequisites.
+%.o: %.c
+       $(CC) $(CFLAGS) $< -o $@
+
+all: $(BIN)
+
+$(BIN): $(OBJS)
+       $(LD) -o $(BIN) $(OBJS) $(LDFLAGS)
+
+clean:
+       $(RM) core Makefile.bak $(OBJS) $(BIN)
+
+# Regenerate the header dependency list at the bottom of this file.
+depend:
+       makedepend -Y. -- $(CFLAGS) -- $(SRCS)
+
+# Reformat the top-level sources in place and remove indent's backup files.
+indent:
+       indent -linux -l120 -lc120 -sob -c33 -cp33 *.c *.h; rm -f *~
+
+# DO NOT DELETE THIS LINE -- make depend depends on it.
+
+nsjail.o: nsjail.h cmdline.h common.h log.h net.h subproc.h
+cmdline.o: cmdline.h common.h log.h
+contain.o: contain.h common.h log.h
+log.o: log.h common.h
+net.o: net.h common.h log.h
+subproc.o: subproc.h common.h contain.h log.h net.h sandbox.h
+sandbox.o: sandbox.h common.h log.h seccomp/bpf-helper.h
+seccomp/bpf-helper.o: seccomp/bpf-helper.h
diff --git a/README b/README
new file mode 100644 (file)
index 0000000..5043bb8
--- /dev/null
+++ b/README
@@ -0,0 +1,52 @@
+### WHAT IS IT?
+NsJail is a Linux isolation tool making use of the namespacing and seccomp-bpf
+subsystems of the Linux kernel.
+
+### WHAT KIND OF ISOLATION DOES IT PROVIDE?
+1. Linux namespaces: UTS, MOUNT, PID, IPC, NET, USER (optional)
+2. FS chroot-ing (chroot()/pivot_root())
+3. Seccomp-bpf syscall filters
+
+### WHAT USE-CASES DOES IT COVER?
+1. Isolating networking daemons (inetd-style)
+
+ * Server:
+ $ ./nsjail -Ml --port 9000 --chroot /chroot/ --user 99999 --group 99999 -- /bin/sh -i
+
+ * Client:
+  $ nc 127.0.0.1 9000
+  / $ ifconfig
+  / $ ifconfig -a
+  lo    Link encap:Local Loopback
+        LOOPBACK  MTU:65536  Metric:1
+           RX packets:0 errors:0 dropped:0 overruns:0 frame:0
+           TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:0
+           RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
+
+
+2. Isolating local processes (run it once, and exit)
+
+ $ ./nsjail -Mo --chroot /chroot/ --user 99999 --group 99999 -- /bin/sh -i
+  / $ ifconfig -a
+  lo    Link encap:Local Loopback
+        LOOPBACK  MTU:65536  Metric:1
+           RX packets:0 errors:0 dropped:0 overruns:0 frame:0
+           TX packets:0 errors:0 dropped:0 overruns:0 carrier:0 collisions:0 txqueuelen:0
+           RX bytes:0 (0.0 B)  TX bytes:0 (0.0 B)
+ / $ id
+ uid=99999 gid=99999
+ / $ exit
+ $
+
+3. Isolating local processes (and re-running them)
+
+ $ ./nsjail -Mr --chroot /chroot/ --user 99999 --group 99999 -- /bin/sh -i
+ BusyBox v1.21.1 (Ubuntu 1:1.21.0-1ubuntu1) built-in shell (ash)
+ Enter 'help' for a list of built-in commands.
+ / $ exit
+ BusyBox v1.21.1 (Ubuntu 1:1.21.0-1ubuntu1) built-in shell (ash)
+ Enter 'help' for a list of built-in commands.
+ / $
+
+### MORE INFO?
+Type: './nsjail --help' - cmd-line switches are well-documented
diff --git a/cmdline.c b/cmdline.c
new file mode 100644 (file)
index 0000000..e12d17e
--- /dev/null
+++ b/cmdline.c
@@ -0,0 +1,397 @@
+/*
+
+   nsjail - cmdline parsing
+
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+
+#include "cmdline.h"
+
+#include <ctype.h>
+#include <errno.h>
+#include <getopt.h>
+#include <grp.h>
+#include <limits.h>
+#include <pwd.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <strings.h>
+#include <sys/personality.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "common.h"
+#include "log.h"
+
+/* Pairs a getopt_long() option entry with the help text printed for it. */
+struct custom_option {
+       struct option opt;      /* entry copied into the table handed to getopt_long() */
+       const char *descr;      /* description printed by cmdlineHelp() */
+};
+
+/* Render a boolean flag as the literal "true" or "false" for log output. */
+static const char *logYesNo(bool yes)
+{
+       if (yes) {
+               return "true";
+       }
+       return "false";
+}
+
+/*
+ * Print the usage banner followed by every option in opts and its description.
+ *
+ * pname: program name shown in the usage line.
+ * opts:  option table; iteration stops at the first entry whose opt.name is NULL.
+ */
+static void cmdlineHelp(const char *pname, struct custom_option *opts)
+{
+       LOG_HELP_BOLD("Usage: %s [options] -- path_to_command [args]", pname);
+       LOG_HELP_BOLD("Options:");
+       for (int i = 0; opts[i].opt.name; i++) {
+               const struct option *o = &opts[i].opt;
+               const char *arg_hint = (o->has_arg == required_argument) ? "[val]" : "";
+               /*
+                * Options without a short form carry values such as 0x0201. Passing a value
+                * outside the unsigned char range to isprint() is undefined behavior, so
+                * range-check before classifying.
+                */
+               bool has_short = o->val >= 0 && o->val <= UCHAR_MAX && isprint(o->val);
+               if (has_short) {
+                       LOG_HELP_BOLD(" --%s%s%c %s", o->name, "|-", o->val, arg_hint);
+               } else {
+                       LOG_HELP_BOLD(" --%s %s", o->name, arg_hint);
+               }
+               LOG_HELP("\t%s", opts[i].descr);
+       }
+}
+
+/*
+ * Log the effective jail configuration: the execution mode, the scalar jail
+ * parameters, and every additional bind/tmpfs mount point.
+ *
+ * nsjconf: configuration to report; bindmountpts and tmpfsmountpts are
+ *          dereferenced, so both must already be allocated.
+ */
+void cmdlineLogParams(struct nsjconf_t *nsjconf)
+{
+       switch (nsjconf->mode) {
+       case MODE_LISTEN_TCP:
+               LOG_I("Mode: LISTEN_TCP");
+               break;
+       case MODE_STANDALONE_ONCE:
+               LOG_I("Mode: STANDALONE_ONCE");
+               break;
+       case MODE_STANDALONE_RERUN:
+               LOG_I("Mode: STANDALONE_RERUN");
+               break;
+       default:
+               /* NOTE(review): LOG_F is presumably fatal -- confirm against log.h */
+               LOG_F("Mode: UNKNOWN");
+               break;
+       }
+
+       /* The label for the UTS flag used to be misspelled as "clonew_newuts". */
+       LOG_I
+           ("Jail parameters: hostname:'%s', chroot:'%s', process:'%s', port:%d, "
+            "max_conns_per_ip:%u, uid:%u, gid:%u, time_limit:%ld, personality:%#lx, daemonize:%s, "
+            "clone_newnet:%s, clone_newuser:%s, clone_newns:%s, clone_newpid:%s, "
+            "clone_newipc:%s, clone_newuts:%s, apply_sandbox:%s, keep_caps:%s",
+            nsjconf->hostname, nsjconf->chroot, nsjconf->argv[0], nsjconf->port,
+            nsjconf->max_conns_per_ip, nsjconf->uid, nsjconf->gid, nsjconf->tlimit,
+            nsjconf->personality, logYesNo(nsjconf->daemonize), logYesNo(nsjconf->clone_newnet),
+            logYesNo(nsjconf->clone_newuser), logYesNo(nsjconf->clone_newns),
+            logYesNo(nsjconf->clone_newpid), logYesNo(nsjconf->clone_newipc),
+            logYesNo(nsjconf->clone_newuts), logYesNo(nsjconf->apply_sandbox), logYesNo(nsjconf->keep_caps));
+       for (size_t i = 0; i < nsjconf->bindmountpts->fs_count; i++) {
+               LOG_I("Additional bind mount point: '%s'", nsjconf->bindmountpts->mountpt[i]);
+       }
+       for (size_t i = 0; i < nsjconf->tmpfsmountpts->fs_count; i++) {
+               LOG_I("Additional tmpfs mount point: '%s'", nsjconf->tmpfsmountpts->mountpt[i]);
+       }
+}
+
+/*
+ * Show the help text for all options, then terminate the process with a
+ * successful exit status. Does not return.
+ */
+static void cmdlineUsage(const char *pname, struct custom_option *opts)
+{
+       cmdlineHelp(pname, opts);
+       exit(EXIT_SUCCESS);
+}
+
+/*
+ * Cheap pre-check applied to numeric option values before they reach strtoul().
+ *
+ * Returns true when s is non-empty and consists only of decimal digits and the
+ * letter 'x' (tolerated so that a "0x" prefix passes); false otherwise.
+ */
+static bool cmdlineIsANumber(const char *s)
+{
+       /* An empty argument is not a number, even though strtoul() would yield 0 */
+       if (s[0] == '\0') {
+               return false;
+       }
+       for (size_t i = 0; s[i] != '\0'; i++) {
+               /* <ctype.h> classifiers need an unsigned char value; a negative char is UB */
+               if (!isdigit((unsigned char)s[i]) && s[i] != 'x') {
+                       return false;
+               }
+       }
+       return true;
+}
+
+/*
+ * Convert the argument of an --rlimit_* option into an rlim_t value.
+ *
+ * res:    RLIMIT_* resource id; used to query the current limits and in messages.
+ * optarg: "max" (case-insensitive) selects the current hard limit and "def" the
+ *         current soft limit; anything else must pass cmdlineIsANumber() and is
+ *         parsed by strtoul() with base 0.
+ * mul:    multiplier applied to a numeric value (e.g. 1024 * 1024 for MB).
+ *
+ * Returns the resulting limit. Failures are reported through LOG_F()/PLOG_F(),
+ * which are presumably fatal -- TODO confirm against log.h.
+ */
+rlim_t cmdlineParseRLimit(int res, const char *optarg, unsigned long mul)
+{
+       struct rlimit cur;
+       if (getrlimit(res, &cur) == -1) {
+               PLOG_F("getrlimit(%d)", res);
+       }
+       if (strcasecmp(optarg, "max") == 0) {
+               return cur.rlim_max;
+       }
+       if (strcasecmp(optarg, "def") == 0) {
+               return cur.rlim_cur;
+       }
+       if (cmdlineIsANumber(optarg) == false) {
+               LOG_F("RLIMIT %d needs a numeric or 'max'/'def' value ('%s' provided)", res, optarg);
+       }
+       /* strtoul() signals a range error only through errno, so clear any stale value */
+       errno = 0;
+       unsigned long parsed = strtoul(optarg, NULL, 0);
+       if (parsed == ULONG_MAX && errno != 0) {
+               PLOG_F("strtoul('%s', 0)", optarg);
+       }
+       /* Validate before scaling: a wrapped product would otherwise go unnoticed */
+       if (mul != 0 && parsed > ULONG_MAX / mul) {
+               LOG_F("RLIMIT %d value '%s' is too large (multiplier: %lu)", res, optarg, mul);
+       }
+       return parsed * mul;
+}
+
+bool cmdlineParse(int argc, char *argv[], struct nsjconf_t * nsjconf)
+{
+       LIST_INIT(&nsjconf->pids);
+       const char *user = "nobody";
+       const char *group = "nobody";
+       const char *logfile = NULL;
+       nsjconf->bindmountpts = malloc(sizeof(*(nsjconf->bindmountpts)));
+       if (nsjconf->bindmountpts == NULL) {
+               LOG_F("malloc");
+       }
+       nsjconf->bindmountpts->fs_count = 0;
+       nsjconf->tmpfsmountpts = malloc(sizeof(*(nsjconf->bindmountpts)));
+       if (nsjconf->tmpfsmountpts == NULL) {
+               LOG_F("malloc");
+       }
+       nsjconf->tmpfsmountpts->fs_count = 0;
+
+        /*  *INDENT-OFF* */
+       struct custom_option custom_opts[] = {
+               {{"help", no_argument, NULL, 'h'}, "Help plz.."},
+               {{"mode", required_argument, NULL, 'M'}, "Execution mode (default: l [MODE_LISTEN_TCP]):\n"
+                       "\tl: Listen to connections on a TCP port (specified with --port) [MODE_LISTEN_TCP]\n"
+                       "\to: Immediately launch a single process on a console [MODE_STANDALONE_ONCE]\n"
+                       "\tr: Immediately launch a single process on a console, keep doing it forever [MODE_STANDALONE_RERUN]"},
+               {{"chroot", required_argument, NULL, 'c'}, "Directory containing / of the jail (default: '/chroot')"},
+               {{"user", required_argument, NULL, 'u'}, "Username/uid of processess inside the jail (default: 'nobody')"},
+               {{"group", required_argument, NULL, 'g'}, "Groupname/gid of processess inside the jail (default: 'nogroup')"},
+               {{"hostname", required_argument, NULL, 'H'}, "UTS name (hostname) of the jail (default: 'NSJAIL')"},
+               {{"port", required_argument, NULL, 'p'}, "TCP port to bind to (only in [MODE_LISTEN_TCP]) (default: 31337)"},
+               {{"max_conns_per_ip", required_argument, NULL, 'i'}, "Maximum number of connections per one IP (default: 0 (unlimited))"},
+               {{"log", required_argument, NULL, 'l'}, "Log file (default: stderr)"},
+               {{"time_limit", required_argument, NULL, 't'}, "Maximum time that a jail can exist, in seconds (default: 600)"},
+               {{"daemon", no_argument, NULL, 'd'}, "Daemonize after start? (default: false)"},
+               {{"verbose", no_argument, NULL, 'v'}, "Verbose output (default: false)"},
+               {{"keep_env", no_argument, NULL, 'e'}, "Should all environment variables be passed to the child? (default: false)"},
+               {{"keep_caps", no_argument, NULL, 0x0502}, "Don't drop capabilities (DANGEROUS) (default: false)"},
+               {{"rlimit_as", required_argument, NULL, 0x0201}, "RLIMIT_AS in MB, 'max' for RLIM_INFINITY, 'def' for the current value (default: 512)"},
+               {{"rlimit_core", required_argument, NULL, 0x0202}, "RLIMIT_CORE in MB, 'max' for RLIM_INFINITY, 'def' for the current value (default: 0)"},
+               {{"rlimit_cpu", required_argument, NULL, 0x0203}, "RLIMIT_CPU, 'max' for RLIM_INFINITY, 'def' for the current value (default: 600)"},
+               {{"rlimit_fsize", required_argument, NULL, 0x0204}, "RLIMIT_FSIZE in MB, 'max' for RLIM_INFINITY, 'def' for the current value (default: 1)"},
+               {{"rlimit_nofile", required_argument, NULL, 0x0205}, "RLIMIT_NOFILE, 'max' for RLIM_INFINITY, 'def' for the current value (default: 32)"},
+               {{"rlimit_nproc", required_argument, NULL, 0x0206}, "RLIMIT_NPROC, 'max' for RLIM_INFINITY, 'def' for the current value (default: 'def')"},
+               {{"rlimit_stack", required_argument, NULL, 0x0207}, "RLIMIT_STACK in MB, 'max' for RLIM_INFINITY, 'def' for the current value (default: 'def')"},
+               {{"persona_addr_compat_layout", no_argument, NULL, 0x0301}, "personality(ADDR_COMPAT_LAYOUT) (default: false)"},
+               {{"persona_mmap_page_zero", no_argument, NULL, 0x0302}, "personality(MMAP_PAGE_ZERO) (default: false)"},
+               {{"persona_read_implies_exec", no_argument, NULL, 0x0303}, "personality(READ_IMPLIES_EXEC) (default: false)"},
+               {{"persona_addr_limit_3gb", no_argument, NULL, 0x0304}, "personality(ADDR_LIMIT_3GB) (default: false)"},
+               {{"persona_addr_no_randomize", no_argument, NULL, 0x0305}, "personality(ADDR_NO_RANDOMIZE) (default: false)"},
+               {{"disable_clone_newnet", no_argument, NULL, 'N'}, "Enable networking inside the jail (default: false)"},
+               {{"disable_clone_newuser", no_argument, NULL, 0x0402}, "Don't use CLONE_NEWUSER (default: false)"},
+               {{"disable_clone_newns", no_argument, NULL, 0x0403}, "Don't use CLONE_NEWNS (default: false)"},
+               {{"disable_clone_newpid", no_argument, NULL, 0x0404}, "Don't use CLONE_NEWPID (default: false)"},
+               {{"disable_clone_newipc", no_argument, NULL, 0x0405}, "Don't use CLONE_NEWIPC (default: false)"},
+               {{"disable_clone_newuts", no_argument, NULL, 0x0406}, "Don't use CLONE_NEWUTS (default: false)"},
+               {{"disable_sandbox", no_argument, NULL, 0x0501}, "Don't enable the seccomp-bpf sandboxing (default: false)"},
+               {{"rw", no_argument, NULL, 0x0503}, "Mount / as RW (default: RO)"},
+               {{"silent", no_argument, NULL, 0x0504}, "Redirect child's fd:0/1/2 to /dev/null (default: false)"},
+               {{"bindmount", required_argument, NULL, 'B'}, "List of mountpoints to be mounted --bind inside the container. Can be specified multiple times (default: none)"},
+               {{"tmpfsmount", required_argument, NULL, 'T'}, "List of mountpoints to be mounted as tmpfs inside the container. Can be specified multiple times (default: none)"},
+               {{0, 0, 0, 0}, NULL},
+       };
+        /*  *INDENT-ON* */
+
+       struct option opts[ARRAYSIZE(custom_opts)];
+       for (unsigned i = 0; i < ARRAYSIZE(custom_opts); i++) {
+               opts[i] = custom_opts[i].opt;
+       }
+
+       int opt_index = 0;
+       for (;;) {
+               int c = getopt_long(argc, argv, "H:c:p:i:u:g:l:t:M:Ndveh?B:T:", opts, &opt_index);
+               if (c == -1) {
+                       break;
+               }
+               switch (c) {
+               case 'H':
+                       nsjconf->hostname = optarg;
+                       break;
+               case 'c':
+                       nsjconf->chroot = optarg;
+                       break;
+               case 'p':
+                       nsjconf->port = strtoul(optarg, NULL, 0);
+                       break;
+               case 'i':
+                       nsjconf->max_conns_per_ip = strtoul(optarg, NULL, 0);
+                       break;
+               case 'u':
+                       user = optarg;
+                       break;
+               case 'g':
+                       group = optarg;
+                       break;
+               case 'l':
+                       logfile = optarg;
+                       break;
+               case 'd':
+                       nsjconf->daemonize = true;
+                       break;
+               case 'v':
+                       nsjconf->verbose = true;
+                       break;
+               case 'e':
+                       nsjconf->keep_env = true;
+                       break;
+               case 't':
+                       nsjconf->tlimit = strtol(optarg, NULL, 0);
+                       break;
+               case 'h':       /* help */
+               case '?':       /* help */
+                       cmdlineUsage(argv[0], custom_opts);
+                       break;
+               case 0x0201:
+                       nsjconf->rl_as = cmdlineParseRLimit(RLIMIT_AS, optarg, (1024 * 1024));
+                       break;
+               case 0x0202:
+                       nsjconf->rl_core = cmdlineParseRLimit(RLIMIT_CORE, optarg, (1024 * 1024));
+                       break;
+               case 0x0203:
+                       nsjconf->rl_cpu = cmdlineParseRLimit(RLIMIT_CPU, optarg, 1);
+                       break;
+               case 0x0204:
+                       nsjconf->rl_fsize = cmdlineParseRLimit(RLIMIT_FSIZE, optarg, (1024 * 1024));
+                       break;
+               case 0x0205:
+                       nsjconf->rl_nofile = cmdlineParseRLimit(RLIMIT_NOFILE, optarg, 1);
+                       break;
+               case 0x0206:
+                       nsjconf->rl_nproc = cmdlineParseRLimit(RLIMIT_NPROC, optarg, 1);
+                       break;
+               case 0x0207:
+                       nsjconf->rl_stack = cmdlineParseRLimit(RLIMIT_STACK, optarg, (1024 * 1024));
+                       break;
+               case 0x0301:
+                       nsjconf->personality |= ADDR_COMPAT_LAYOUT;
+                       break;
+               case 0x0302:
+                       nsjconf->personality |= MMAP_PAGE_ZERO;
+                       break;
+               case 0x0303:
+                       nsjconf->personality |= READ_IMPLIES_EXEC;
+                       break;
+               case 0x0304:
+                       nsjconf->personality |= ADDR_LIMIT_3GB;
+                       break;
+               case 0x0305:
+                       nsjconf->personality |= ADDR_NO_RANDOMIZE;
+                       break;
+               case 'N':
+                       nsjconf->clone_newnet = false;
+                       break;
+               case 0x0402:
+                       nsjconf->clone_newuser = false;
+                       break;
+               case 0x0403:
+                       nsjconf->clone_newns = false;
+                       break;
+               case 0x0404:
+                       nsjconf->clone_newpid = false;
+                       break;
+               case 0x0405:
+                       nsjconf->clone_newipc = false;
+                       break;
+               case 0x0406:
+                       nsjconf->clone_newuts = false;
+                       break;
+               case 0x0501:
+                       nsjconf->apply_sandbox = false;
+                       break;
+               case 0x0502:
+                       nsjconf->keep_caps = true;
+                       break;
+               case 0x0503:
+                       nsjconf->is_root_rw = true;
+                       break;
+               case 0x0504:
+                       nsjconf->is_silent = true;
+                       break;
+               case 'B':
+                       nsjconf->bindmountpts->fs_count++;
+                       size_t sz =
+                           sizeof(*(nsjconf->bindmountpts)) +
+                           (sizeof(nsjconf->bindmountpts->mountpt[0]) * nsjconf->bindmountpts->fs_count);
+                       if (realloc(nsjconf->bindmountpts, sz) == NULL) {
+                               LOG_F("realloc(%zu)", sz);
+                       }
+                       nsjconf->bindmountpts->mountpt[nsjconf->bindmountpts->fs_count - 1] = optarg;
+                       break;
+               case 'T':
+                       nsjconf->tmpfsmountpts->fs_count++;
+                       sz = sizeof(*(nsjconf->tmpfsmountpts)) +
+                           (sizeof(nsjconf->tmpfsmountpts->mountpt[0]) * nsjconf->tmpfsmountpts->fs_count);
+                       if (realloc(nsjconf->tmpfsmountpts, sz) == NULL) {
+                               LOG_F("realloc(%zu)", sz);
+                       }
+                       nsjconf->tmpfsmountpts->mountpt[nsjconf->tmpfsmountpts->fs_count - 1] = optarg;
+                       break;
+               case 'M':
+                       switch (optarg[0]) {
+                       case 'l':
+                               nsjconf->mode = MODE_LISTEN_TCP;
+                               break;
+                       case 'o':
+                               nsjconf->mode = MODE_STANDALONE_ONCE;
+                               break;
+                       case 'r':
+                               nsjconf->mode = MODE_STANDALONE_RERUN;
+                               break;
+                       default:
+                               LOG_E("Modes supported: -M l - MODE_LISTEN_TCP (default)");
+                               LOG_E("                 -M o - MODE_STANDALONE_ONCE");
+                               LOG_E("                 -M r - MODE_STANDALONE_RERUN");
+                               cmdlineUsage(argv[0], custom_opts);
+                               return false;
+                               break;
+                       }
+                       break;
+               default:
+                       cmdlineUsage(argv[0], custom_opts);
+                       return false;
+                       break;
+               }
+       }
+
+       if (logInitLogFile(nsjconf, logfile, nsjconf->verbose) == false) {
+               return false;
+       }
+
+       nsjconf->argv = &argv[optind];
+       if (nsjconf->argv[0] == NULL) {
+               LOG_E("No command provided");
+               cmdlineUsage(argv[0], custom_opts);
+               return false;
+       }
+
+       struct passwd *pw = getpwnam(user);
+       if (pw != NULL) {
+               nsjconf->uid = pw->pw_uid;
+       } else if (cmdlineIsANumber(user)) {
+               nsjconf->uid = (uid_t) strtoull(user, NULL, 0);
+       } else {
+               LOG_E("No such user '%s'", user);
+               return false;
+       }
+       struct group *gr = getgrnam(group);
+       if (gr != NULL) {
+               nsjconf->gid = gr->gr_gid;
+       } else if (cmdlineIsANumber(group)) {
+               nsjconf->gid = (gid_t) strtoull(group, NULL, 0);
+       } else {
+               LOG_E("No such group '%s'", group);
+               return false;
+       }
+
+       return true;
+}
diff --git a/cmdline.h b/cmdline.h
new file mode 100644 (file)
index 0000000..2cd4dc7
--- /dev/null
+++ b/cmdline.h
@@ -0,0 +1,34 @@
+/*
+
+   nsjail - cmdline parsing
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+
+#ifndef _CMDLINE_H
+#define _CMDLINE_H
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include "common.h"
+
+/*
+ * Turns the option argument 'optarg' into a limit for resource 'res' (an RLIMIT_* constant).
+ * cmdlineParse() passes mul = 1024 * 1024 for the AS/CORE/FSIZE/STACK limits and mul = 1 for the
+ * CPU/NOFILE/NPROC limits, so 'mul' presumably is a unit multiplier - confirm in cmdline.c.
+ */
+rlim_t cmdlineParseRLimit(int res, const char *optarg, unsigned long mul);
+/* NOTE(review): presumably logs the parsed configuration - confirm in cmdline.c */
+void cmdlineLogParams(struct nsjconf_t *nsjconf);
+/*
+ * Parses the command line into 'nsjconf'. Returns false on an unsupported option or mode, when
+ * the log file cannot be opened, when no command follows the options, or when the user/group
+ * cannot be resolved; returns true otherwise.
+ */
+bool cmdlineParse(int argc, char *argv[], struct nsjconf_t *nsjconf);
+
+#endif                         /* _CMDLINE_H */
diff --git a/common.h b/common.h
new file mode 100644 (file)
index 0000000..8740d80
--- /dev/null
+++ b/common.h
@@ -0,0 +1,90 @@
+/*
+
+   nsjail - common structures
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+
+#ifndef _COMMON_H
+#define _COMMON_H
+
+#include <netinet/ip6.h>
+#include <stdbool.h>
+#include <sys/queue.h>
+#include <sys/resource.h>
+#include <sys/types.h>
+
+/*
+ * Number of elements of a real array. Must not be used on a pointer (including an array
+ * function parameter): the result would be silently wrong.
+ */
+#define ARRAYSIZE(array) (sizeof(array) / sizeof((array)[0]))
+
+/*
+ * One element of the 'pids' list kept in struct nsjconf_t.
+ * NOTE(review): presumably one entry per running child process - confirm in subproc.c.
+ */
+struct pids_t {
+       pid_t pid;
+       /* NOTE(review): presumably the time the child was started - confirm in subproc.c */
+       time_t start;
+       /* NOTE(review): presumably the peer address in printable form - confirm in net.c */
+       char remote_txt[64];
+       /* Peer address; netLimitConns() compares it with the address of a new connection */
+       struct sockaddr_in6 remote_addr;
+        LIST_ENTRY(pids_t) pointers;
+};
+
+/*
+ * Operating mode, chosen with "-M l", "-M o" and "-M r" respectively. MODE_LISTEN_TCP has the
+ * value 0 and is the mode the -M usage message describes as the default.
+ */
+enum mode_t {
+       MODE_LISTEN_TCP = 0,
+       MODE_STANDALONE_ONCE,
+       MODE_STANDALONE_RERUN
+};
+
+/*
+ * Variable-length list of mount point paths (used for both the bind and the tmpfs lists).
+ */
+struct mountfs_t {
+       /* Number of valid entries in mountpt[] */
+       size_t fs_count;
+       /* Flexible array member: the owner must allocate room for fs_count pointers */
+       char *mountpt[];
+};
+
+/*
+ * Run-time configuration of one nsjail instance. cmdlineParse() sets the option-controlled
+ * fields; the contain*() routines read them while preparing the jailed process.
+ */
+struct nsjconf_t {
+       /* Name given to sethostname() when clone_newuts is set */
+       const char *hostname;
+       /* Directory that containMountFS() bind-mounts as the new root */
+       const char *chroot;
+       /* Command to run: points at the first non-option element of the original argv */
+       char *const *argv;
+       /* NOTE(review): presumably the TCP port used in MODE_LISTEN_TCP - confirm in net.c */
+       int port;
+       /* Ids the jailed process is switched to by containDropPrivs() */
+       uid_t uid;
+       gid_t gid;
+       /* Set by -d; also makes logInitLogFile() default to its built-in log file path */
+       bool daemonize;
+       /* Set by -t. NOTE(review): presumably a run-time limit; the unit is not visible here */
+       time_t tlimit;
+       /* Defaults are not visible here; cmdlineParse() can only clear apply_sandbox */
+       bool apply_sandbox;
+       /* Set by -v; enables DEBUG level log messages */
+       bool verbose;
+       /* Set by -e */
+       bool keep_env;
+       /* When false, containDropPrivs() clears all capability sets */
+       bool keep_caps;
+       /* Values applied (as both soft and hard limit) by containSetLimits() */
+       rlim_t rl_as;
+       rlim_t rl_core;
+       rlim_t rl_cpu;
+       rlim_t rl_fsize;
+       rlim_t rl_nofile;
+       rlim_t rl_nproc;
+       rlim_t rl_stack;
+       /* OR of personality(2) flags; applied by containPrepareEnv() when non-zero */
+       unsigned long personality;
+       /*
+        * Namespace switches; each can be cleared from the command line. clone_newuser gates the
+        * uid/gid map setup and clone_newuts gates sethostname().
+        */
+       bool clone_newnet;
+       bool clone_newuser;
+       bool clone_newns;
+       bool clone_newpid;
+       bool clone_newipc;
+       bool clone_newuts;
+       enum mode_t mode;
+       /* When false, containMountFS() remounts '/' read-only */
+       bool is_root_rw;
+       /* In the non-listening modes: redirect stdin/stdout/stderr to /dev/null */
+       bool is_silent;
+       /* Paths to bind-mount into the new root / paths to cover with a tmpfs */
+       struct mountfs_t *bindmountpts;
+       struct mountfs_t *tmpfsmountpts;
+       /* Written as the second field of /proc/self/uid_map and gid_map respectively */
+       uid_t initial_uid;
+       gid_t initial_gid;
+       /* Connection limit checked by netLimitConns(); 0 means unlimited */
+       unsigned int max_conns_per_ip;
+        LIST_HEAD(pidslist, pids_t) pids;
+};
+
+#endif                         /* _COMMON_H */
diff --git a/contain.c b/contain.c
new file mode 100644 (file)
index 0000000..7774756
--- /dev/null
+++ b/contain.c
@@ -0,0 +1,375 @@
+/*
+
+   nsjail - isolating the binary
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+#include "contain.h"
+
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <grp.h>
+#include <limits.h>
+#include <linux/capability.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/mount.h>
+#include <sys/personality.h>
+#include <sys/prctl.h>
+#include <sys/resource.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "log.h"
+
+/*
+ * Writes "deny" to /proc/self/setgroups. A file that cannot be opened is treated as "kernel has
+ * no such knob" and counts as success; a failed write is an error.
+ */
+static bool containSetGroups(void)
+{
+       static const char deny[] = "deny";
+
+       int setgroups_fd = open("/proc/self/setgroups", O_WRONLY | O_CLOEXEC);
+       if (setgroups_fd == -1) {
+               /* Older kernels do not provide this file */
+               PLOG_D("'/proc/self/setgroups' not present in this kernel?");
+               return true;
+       }
+
+       bool written = true;
+       if (write(setgroups_fd, deny, sizeof(deny) - 1) == -1) {
+               PLOG_E("write('/proc/self/setgroups', '%s') failed", deny);
+               written = false;
+       }
+       close(setgroups_fd);
+       return written;
+}
+
+/*
+ * When a new user namespace is in use, writes one-entry maps "<uid> <initial_uid> 1" and
+ * "<gid> <initial_gid> 1" to /proc/self/uid_map and /proc/self/gid_map. Does nothing (and
+ * succeeds) when clone_newuser is off. Returns false after logging on any open/write failure.
+ */
+static bool containUidGidMap(struct nsjconf_t *nsjconf, uid_t uid, gid_t gid)
+{
+       if (!nsjconf->clone_newuser) {
+               return true;
+       }
+
+       char line[64];
+
+       /* uid map */
+       int map_fd = open("/proc/self/uid_map", O_WRONLY | O_CLOEXEC);
+       if (map_fd == -1) {
+               PLOG_E("open('/proc/self/uid_map', O_WRONLY | O_CLOEXEC)");
+               return false;
+       }
+       snprintf(line, sizeof(line), "%lu %lu 1", (unsigned long)uid, (unsigned long)nsjconf->initial_uid);
+       LOG_D("Writing '%s' to /proc/self/uid_map", line);
+       ssize_t written = write(map_fd, line, strlen(line));
+       if (written == -1) {
+               PLOG_E("write('/proc/self/uid_map', %d, '%s')", map_fd, line);
+               close(map_fd);
+               return false;
+       }
+       close(map_fd);
+
+       /* gid map */
+       map_fd = open("/proc/self/gid_map", O_WRONLY | O_CLOEXEC);
+       if (map_fd == -1) {
+               PLOG_E("open('/proc/self/gid_map', O_WRONLY | O_CLOEXEC)");
+               return false;
+       }
+       snprintf(line, sizeof(line), "%lu %lu 1", (unsigned long)gid, (unsigned long)nsjconf->initial_gid);
+       LOG_D("Writing '%s' to /proc/self/gid_map", line);
+       written = write(map_fd, line, strlen(line));
+       if (written == -1) {
+               PLOG_E("write('/proc/self/gid_map', %d, '%s')", map_fd, line);
+               close(map_fd);
+               return false;
+       }
+       close(map_fd);
+
+       return true;
+}
+
+/*
+ * Switches the calling process to nsjconf->gid / nsjconf->uid and, unless nsjconf->keep_caps is
+ * set, empties its capability sets. The steps run in this order: deny setgroups, write the
+ * uid/gid maps, clear supplementary groups, setresgid, setresuid, no_new_privs, capset.
+ * Returns false (after logging) when a mandatory step fails, true otherwise.
+ */
+bool containDropPrivs(struct nsjconf_t * nsjconf)
+{
+       if (containSetGroups() == false) {
+               return false;
+       }
+       if (containUidGidMap(nsjconf, nsjconf->uid, nsjconf->gid) == false) {
+               return false;
+       }
+       /*
+        * Best effort because of /proc/self/setgroups
+        */
+       gid_t *group_list = NULL;
+       if (setgroups(0, group_list) == -1) {
+               PLOG_D("setgroups(NULL) failed");
+       }
+       /* Group ids first, then user ids: real, effective and saved are all replaced */
+       if (setresgid(nsjconf->gid, nsjconf->gid, nsjconf->gid) == -1) {
+               PLOG_E("setresgid(%u)", nsjconf->gid);
+               return false;
+       }
+       if (setresuid(nsjconf->uid, nsjconf->uid, nsjconf->uid) == -1) {
+               PLOG_E("setresuid(%u)", nsjconf->uid);
+               return false;
+       }
+       /* Fallback definition for build environments whose headers lack the constant */
+#ifndef PR_SET_NO_NEW_PRIVS
+#define PR_SET_NO_NEW_PRIVS 38
+#endif
+       /* A failure here is only a warning, not an error */
+       if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0) == -1) {
+               /* Only new kernels support it */
+               PLOG_W("prctl(PR_SET_NO_NEW_PRIVS, 1)");
+       }
+
+       if (nsjconf->keep_caps == false) {
+               if (prctl(PR_SET_KEEPCAPS, 0, 0, 0, 0) == -1) {
+                       PLOG_E("prctl(PR_SET_KEEPCAPS, 0)");
+                       return false;
+               }
+               /* pid 0 addresses the calling process */
+               struct __user_cap_header_struct cap_hdr = {
+                       .version = _LINUX_CAPABILITY_VERSION_3,
+                       .pid = 0,
+               };
+               /* All-zero inheritable/effective/permitted sets (GNU range designators) */
+               struct __user_cap_data_struct cap_data[_LINUX_CAPABILITY_U32S_3] = {
+                       [0 ... (_LINUX_CAPABILITY_U32S_3 - 1)].inheritable = 0U,
+                       [0 ... (_LINUX_CAPABILITY_U32S_3 - 1)].effective = 0U,
+                       [0 ... (_LINUX_CAPABILITY_U32S_3 - 1)].permitted = 0U,
+               };
+               if (syscall(__NR_capset, &cap_hdr, &cap_data) == -1) {
+                       PLOG_E("capset()");
+                       return false;
+               }
+       }
+       return true;
+}
+
+/*
+ * Per-process environment setup: hostname (only with a new UTS namespace), SIGKILL on parent
+ * death, the requested personality flags (only when any are set) and a nice value of 19.
+ * Only the setpriority() step is non-fatal; every other failure logs and returns false.
+ */
+bool containPrepareEnv(struct nsjconf_t * nsjconf)
+{
+       LOG_D("Setting hostname to '%s'", nsjconf->hostname);
+       if (nsjconf->clone_newuts && sethostname(nsjconf->hostname, strlen(nsjconf->hostname)) == -1) {
+               PLOG_E("sethostname('%s')", nsjconf->hostname);
+               return false;
+       }
+
+       if (prctl(PR_SET_PDEATHSIG, SIGKILL, 0, 0, 0) == -1) {
+               PLOG_E("prctl(PR_SET_PDEATHSIG, SIGKILL)");
+               return false;
+       }
+
+       if (nsjconf->personality != 0) {
+               if (personality(nsjconf->personality) == -1) {
+                       PLOG_E("personality(%lx)", nsjconf->personality);
+                       return false;
+               }
+       }
+
+       /* Failing to lower the priority is reported but does not abort the setup */
+       errno = 0;
+       int prio_ret = setpriority(PRIO_PROCESS, 0, 19);
+       if (prio_ret == -1 && errno != 0) {
+               PLOG_W("setpriority(19)");
+       }
+
+       return true;
+}
+
+/*
+ * Builds the file-system view of the jail. In order:
+ *  1. mounts a tmpfs on /tmp, creates /tmp/new_root and bind-mounts nsjconf->chroot on it,
+ *  2. bind-mounts every nsjconf->bindmountpts entry at the same path below new_root,
+ *  3. pivot_root()s into the tmpfs (old root lands on /pivot_root), mounts proc on
+ *     /new_root/proc, detaches the old root and chroot()s into /new_root,
+ *  4. mounts a 4194304-byte tmpfs on every nsjconf->tmpfsmountpts entry,
+ *  5. unless nsjconf->is_root_rw is set, remounts '/' read-only.
+ * Returns true on success; on the first failure logs the error and returns false (mounts
+ * already made are not undone).
+ */
+bool containMountFS(struct nsjconf_t * nsjconf)
+{
+       const char *destdir = "/tmp";
+       if (mount("none", destdir, "tmpfs", 0, NULL) == -1) {
+               PLOG_E("mount('%s', 'tmpfs')", destdir);
+               return false;
+       }
+       char newrootdir[PATH_MAX];
+       snprintf(newrootdir, sizeof(newrootdir), "%s/%s", destdir, "new_root");
+       if (mkdir(newrootdir, 0755) == -1) {
+               PLOG_E("mkdir('%s')", newrootdir);
+               return false;
+       }
+       if (mount(nsjconf->chroot, newrootdir, NULL, MS_BIND | MS_REC, NULL) == -1) {
+               PLOG_E("mount('%s', '%s', MS_BIND | MS_REC)", nsjconf->chroot, newrootdir);
+               return false;
+       }
+
+       char mount_pt[PATH_MAX];
+       for (size_t i = 0; i < nsjconf->bindmountpts->fs_count; i++) {
+               const char *src = nsjconf->bindmountpts->mountpt[i];
+               /* A truncated target would silently mount on a different path: refuse instead */
+               int len = snprintf(mount_pt, sizeof(mount_pt), "%s/%s", newrootdir, src);
+               if (len < 0 || (size_t) len >= sizeof(mount_pt)) {
+                       LOG_E("Mount point path '%s/%s' is too long", newrootdir, src);
+                       return false;
+               }
+               if (mkdir(mount_pt, 0700) == -1 && errno != EEXIST) {
+                       PLOG_E("mkdir('%s')", mount_pt);
+                       return false;
+               }
+               LOG_D("Mounting (bind) '%s' on '%s'", src, mount_pt);
+               if (mount(src, mount_pt, NULL, MS_BIND | MS_REC, NULL) == -1) {
+                       PLOG_E("mount('%s', '%s', MS_BIND|MS_REC)", src, mount_pt);
+                       return false;
+               }
+       }
+
+       char pivotrootdir[PATH_MAX];
+       snprintf(pivotrootdir, sizeof(pivotrootdir), "%s/%s", destdir, "pivot_root");
+       if (mkdir(pivotrootdir, 0755) == -1) {
+               PLOG_E("mkdir('%s')", pivotrootdir);
+               return false;
+       }
+       if (syscall(__NR_pivot_root, destdir, pivotrootdir) == -1) {
+               PLOG_E("pivot_root('%s', '%s')", destdir, pivotrootdir);
+               return false;
+       }
+
+       /* From here on the tmpfs is '/', so the staging directories lose their /tmp prefix */
+       char procrootdir[PATH_MAX] = "/new_root/proc";
+       if (mount(NULL, procrootdir, "proc", MS_NOSUID | MS_NOEXEC | MS_NODEV, NULL) == -1) {
+               PLOG_E("mount('%s', 'proc')", procrootdir);
+               return false;
+       }
+       if (umount2("/pivot_root", MNT_DETACH) == -1) {
+               PLOG_E("umount2('/pivot_root', MNT_DETACH)");
+               return false;
+       }
+       if (chroot("/new_root") == -1) {
+               PLOG_E("CHROOT('/new_root')");
+               return false;
+       }
+
+       if (chdir("/") == -1) {
+               PLOG_E("chdir('/')");
+               return false;
+       }
+
+       /* tmpfs mounts are made inside the final root, before it is (optionally) made read-only */
+       for (size_t i = 0; i < nsjconf->tmpfsmountpts->fs_count; i++) {
+               if (mkdir(nsjconf->tmpfsmountpts->mountpt[i], 0700) == -1 && errno != EEXIST) {
+                       PLOG_E("mkdir('%s')", nsjconf->tmpfsmountpts->mountpt[i]);
+                       return false;
+               }
+               LOG_D("Mounting (tmpfs) '%s'", nsjconf->tmpfsmountpts->mountpt[i]);
+               if (mount(NULL, nsjconf->tmpfsmountpts->mountpt[i], "tmpfs", 0, "size=4194304")
+                   == -1) {
+                       PLOG_E("mount('%s', 'tmpfs')", nsjconf->tmpfsmountpts->mountpt[i]);
+                       return false;
+               }
+       }
+
+       if (nsjconf->is_root_rw == false) {
+               if (mount
+                   ("/", "/", NULL, MS_BIND | MS_RDONLY | MS_NOSUID | MS_NODEV | MS_REMOUNT | MS_PRIVATE,
+                    NULL) == -1) {
+                       PLOG_E("mount('/', '/', MS_BIND|MS_RDONLY|MS_NOSUID|MS_NODEV|MS_REMOUNT|MS_PRIVATE)");
+                       return false;
+               }
+       }
+
+       return true;
+}
+
+/*
+ * Applies the configured resource limits to the calling process. Each value is installed as
+ * both the soft and the hard limit, in the order AS, CORE, CPU, FSIZE, NOFILE, NPROC, STACK.
+ * Stops at the first setrlimit() failure, logs it and returns false; returns true when all
+ * limits were set.
+ */
+bool containSetLimits(struct nsjconf_t * nsjconf)
+{
+       const struct {
+               int res;
+               const char *name;
+               rlim_t val;
+       } limits[] = {
+               {RLIMIT_AS, "RLIMIT_AS", nsjconf->rl_as},
+               {RLIMIT_CORE, "RLIMIT_CORE", nsjconf->rl_core},
+               {RLIMIT_CPU, "RLIMIT_CPU", nsjconf->rl_cpu},
+               {RLIMIT_FSIZE, "RLIMIT_FSIZE", nsjconf->rl_fsize},
+               {RLIMIT_NOFILE, "RLIMIT_NOFILE", nsjconf->rl_nofile},
+               {RLIMIT_NPROC, "RLIMIT_NPROC", nsjconf->rl_nproc},
+               {RLIMIT_STACK, "RLIMIT_STACK", nsjconf->rl_stack},
+       };
+
+       for (size_t i = 0; i < sizeof(limits) / sizeof(limits[0]); i++) {
+               struct rlimit rl = {
+                       .rlim_cur = limits[i].val,
+                       .rlim_max = limits[i].val,
+               };
+               if (setrlimit(limits[i].res, &rl) == -1) {
+                       /* rlim_t is not 'unsigned long' on every ABI: widen it explicitly for printing */
+                       PLOG_E("setrlimit(%s, %llu)", limits[i].name, (unsigned long long)limits[i].val);
+                       return false;
+               }
+       }
+       return true;
+}
+
+/*
+ * Marks every open descriptor above stderr as close-on-exec by walking /proc/self/fd.
+ * Entries whose name is not a plain decimal descriptor number are skipped with a warning.
+ * Returns false (after logging, with the directory stream closed) when the directory cannot be
+ * read or when the flags of a descriptor cannot be read or updated; true otherwise.
+ */
+bool containMakeFdsCOE(void)
+{
+       /* Make all fds above stderr close-on-exec */
+       DIR *dir = opendir("/proc/self/fd");
+       if (dir == NULL) {
+               PLOG_E("opendir('/proc/self/fd')");
+               return false;
+       }
+       for (;;) {
+               /* readdir() returns NULL both at the end and on error: errno tells them apart */
+               errno = 0;
+               struct dirent *entry = readdir(dir);
+               if (entry == NULL && errno != 0) {
+                       PLOG_E("readdir('/proc/self/fd')");
+                       closedir(dir);
+                       return false;
+               }
+               if (entry == NULL) {
+                       break;
+               }
+               if (strcmp(".", entry->d_name) == 0) {
+                       continue;
+               }
+               if (strcmp("..", entry->d_name) == 0) {
+                       continue;
+               }
+               /* Validate via the end pointer: strtoul() need not set errno for non-numeric input */
+               char *endptr = NULL;
+               errno = 0;
+               unsigned long val = strtoul(entry->d_name, &endptr, 10);
+               if (endptr == entry->d_name || *endptr != '\0' || errno != 0 || val > INT_MAX) {
+                       LOG_W("Cannot convert /proc/self/fd/%s to a number", entry->d_name);
+                       continue;
+               }
+               int fd = (int)val;
+               if (fd <= STDERR_FILENO) {
+                       continue;
+               }
+               int flags = fcntl(fd, F_GETFD, 0);
+               if (flags == -1) {
+                       PLOG_E("fcntl(fd, F_GETFD, 0)");
+                       closedir(dir);
+                       return false;
+               }
+               /* A descriptor that stays inheritable would leak into the jailed program */
+               if (fcntl(fd, F_SETFD, flags | FD_CLOEXEC) == -1) {
+                       PLOG_E("fcntl(%d, F_SETFD, flags | FD_CLOEXEC)", fd);
+                       closedir(dir);
+                       return false;
+               }
+               LOG_D("Set fd '%d' flag to FD_CLOEXEC", fd);
+       }
+       closedir(dir);
+       return true;
+}
+
+/*
+ * Installs the standard descriptors of the jailed process.
+ *  - MODE_LISTEN_TCP: fd_in/fd_out/fd_err are duplicated onto stdin/stdout/stderr.
+ *  - other modes, is_silent off: nothing is changed.
+ *  - other modes, is_silent on: /dev/null is duplicated onto all three; the arguments are
+ *    ignored.
+ * Returns true on success, false (after logging) when open() or dup2() fails.
+ */
+bool containSetupFD(struct nsjconf_t * nsjconf, int fd_in, int fd_out, int fd_err)
+{
+       int devnull = -1;
+       bool ret = false;
+
+       if (nsjconf->mode != MODE_LISTEN_TCP) {
+               if (nsjconf->is_silent == false) {
+                       return true;
+               }
+               if ((devnull = open("/dev/null", O_RDWR)) == -1) {
+                       PLOG_E("open('/dev/null', O_RDWR)");
+                       return false;
+               }
+               fd_in = fd_out = fd_err = devnull;
+       }
+       /* Replace stdin/stdout/stderr with the chosen descriptors */
+       if (dup2(fd_in, STDIN_FILENO) == -1) {
+               PLOG_E("dup2(%d, STDIN_FILENO)", fd_in);
+               goto out;
+       }
+       if (dup2(fd_out, STDOUT_FILENO) == -1) {
+               PLOG_E("dup2(%d, STDOUT_FILENO)", fd_out);
+               goto out;
+       }
+       if (dup2(fd_err, STDERR_FILENO) == -1) {
+               PLOG_E("dup2(%d, STDERR_FILENO)", fd_err);
+               goto out;
+       }
+       ret = true;
+out:
+       /*
+        * The /dev/null descriptor opened here is no longer needed once duplicated. Leave it
+        * alone if open() happened to return one of the standard descriptors themselves.
+        */
+       if (devnull > STDERR_FILENO) {
+               close(devnull);
+       }
+       return ret;
+}
diff --git a/contain.h b/contain.h
new file mode 100644 (file)
index 0000000..c61436d
--- /dev/null
+++ b/contain.h
@@ -0,0 +1,36 @@
+/*
+
+   nsjail - isolating the binary
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+
+#ifndef _CONTAIN_H
+#define _CONTAIN_H
+
+#include <stdbool.h>
+
+#include "common.h"
+
+/* All functions below return true on success and false after logging a failure */
+
+/* Switches to nsjconf->uid/gid and, unless keep_caps is set, clears all capabilities */
+bool containDropPrivs(struct nsjconf_t *nsjconf);
+/* Sets hostname (new UTS namespace only), parent-death signal, personality and priority */
+bool containPrepareEnv(struct nsjconf_t *nsjconf);
+/* Builds the new root from nsjconf->chroot plus the bind/tmpfs mount lists and enters it */
+bool containMountFS(struct nsjconf_t *nsjconf);
+/* Applies the nsjconf->rl_* values with setrlimit() */
+bool containSetLimits(struct nsjconf_t *nsjconf);
+/* Sets FD_CLOEXEC on every open descriptor above stderr */
+bool containMakeFdsCOE(void);
+/* Duplicates the given descriptors (or /dev/null in silent mode) onto stdin/stdout/stderr */
+bool containSetupFD(struct nsjconf_t *nsjconf, int fd_in, int fd_out, int fd_err);
+
+#endif                         /* _CONTAIN_H */
diff --git a/log.c b/log.c
new file mode 100644 (file)
index 0000000..8f73ef0
--- /dev/null
+++ b/log.c
@@ -0,0 +1,128 @@
+/*
+
+   nsjail - logging
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+#include "log.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+/*
+ * Logging state, declared thread-local (__thread). These defaults (stderr, treated as a
+ * terminal, DEBUG messages off) stay in effect until logInitLogFile() changes them.
+ */
+static __thread int log_fd = STDERR_FILENO;
+static __thread bool log_fd_isatty = true;
+static __thread bool log_verbose = false;
+
+/* Log file used by logInitLogFile() when daemonizing and no log file was given */
+#define _LOG_DEFAULT_FILE "/var/log/nsjail.log"
+
+/*
+ * Chooses the log destination and the verbosity. When 'logfile' is NULL the destination is
+ * _LOG_DEFAULT_FILE if nsjconf->daemonize is set and /dev/tty otherwise. The file is opened
+ * for appending (and created with mode 0640 if missing). If it cannot be opened, logging is
+ * pointed at stderr, the error is reported there and false is returned.
+ */
+bool logInitLogFile(struct nsjconf_t *nsjconf, const char *logfile, bool is_verbose)
+{
+       log_verbose = is_verbose;
+
+       if (logfile == NULL) {
+               logfile = nsjconf->daemonize ? _LOG_DEFAULT_FILE : "/dev/tty";
+       }
+
+       int fd = open(logfile, O_CREAT | O_RDWR | O_APPEND, 0640);
+       if (fd == -1) {
+               log_fd = STDERR_FILENO;
+               PLOG_E("Couldn't open logfile open('%s')", logfile);
+               return false;
+       }
+
+       log_fd = fd;
+       /* Colour escapes are only emitted when the destination is a terminal */
+       log_fd_isatty = (isatty(fd) == 1);
+       return true;
+}
+
+/*
+ * Formats and writes one log line to the current log descriptor.
+ *
+ *  ll    severity; DEBUG lines are dropped unless verbose logging is on, FATAL ends the
+ *        process with exit(1) after the line has been written
+ *  fn/ln function name and line of the caller (printed for every level except the help ones)
+ *  perr  when true, ": <strerror(errno)>" is appended, using errno as it was on entry
+ *  fmt   printf-style format followed by its arguments
+ */
+void logLog(enum llevel_t ll, const char *fn, int ln, bool perr, const char *fmt, ...)
+{
+       if (ll == DEBUG && !log_verbose) {
+               return;
+       }
+
+       /* Take the errno text first: the library calls below are free to change errno */
+       char errtxt[512];
+       if (perr) {
+               snprintf(errtxt, sizeof(errtxt), "%s", strerror(errno));
+       }
+
+       /* Indexed by enum llevel_t: tag, terminal colour prefix, print "[...] fn():line"? */
+       static const struct {
+               const char *descr;
+               const char *prefix;
+               bool print_funcline;
+       } levels[] = {
+               {"HR", "\033[0m", false},
+               {"HB", "\033[1m", false},
+               {"D", "\033[0;4m", true},
+               {"I", "\033[1m", true},
+               {"W", "\033[0;33m", true},
+               {"E", "\033[1;31m", true},
+               {"F", "\033[7;35m", true},
+       };
+
+       /* Timestamp in local time, including the numeric UTC offset */
+       time_t now = time(NULL);
+       struct tm local_tm;
+       localtime_r(&now, &local_tm);
+       char stamp[32];
+       if (strftime(stamp, sizeof(stamp) - 1, "%FT%T%z", &local_tm) == 0) {
+               stamp[0] = '\0';
+       }
+
+       if (log_fd_isatty) {
+               dprintf(log_fd, "%s", levels[ll].prefix);
+       }
+       if (levels[ll].print_funcline) {
+               dprintf(log_fd, "[%s][%s][%d] %s():%d ", stamp, levels[ll].descr, getpid(), fn, ln);
+       }
+
+       va_list ap;
+       va_start(ap, fmt);
+       vdprintf(log_fd, fmt, ap);
+       va_end(ap);
+
+       if (perr) {
+               dprintf(log_fd, ": %s", errtxt);
+       }
+       if (log_fd_isatty) {
+               dprintf(log_fd, "\033[0m");
+       }
+       dprintf(log_fd, "\n");
+
+       if (ll == FATAL) {
+               exit(1);
+       }
+}
+
+/*
+ * Logs, at INFO level, that the server stops because signal 'sig' was caught. It only writes
+ * the message; it does not terminate the process itself.
+ * NOTE(review): if this is called from a signal handler, logLog() relies on functions
+ * (dprintf, strftime, localtime_r) that are not async-signal-safe - confirm the call site.
+ */
+void logStop(int sig)
+{
+       LOG_I("Server stops due to fatal signal (%d) caught. Exiting", sig);
+}
diff --git a/log.h b/log.h
new file mode 100644 (file)
index 0000000..0de3c4d
--- /dev/null
+++ b/log.h
@@ -0,0 +1,59 @@
+/*
+
+   nsjail - logging
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+#ifndef _LOG_H
+#define _LOG_H
+
+#include <getopt.h>
+#include <stdbool.h>
+
+#include "common.h"
+
+/*
+ * Logging front-ends. Each one expands to a logLog() call tagged with the calling function
+ * and line. The LOG_* forms print the message only; the PLOG_* forms pass perr = true, which
+ * makes logLog() append ": <strerror(errno)>". The *_F forms use the FATAL level, for which
+ * logLog() calls exit(1).
+ *
+ * NOTE(review): every expansion already ends in ';', so "LOG_E(...);" produces an extra empty
+ * statement. Always brace the body of an if/else that contains one of these macros.
+ */
+#define LOG_HELP(...) logLog(HELP, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+#define LOG_HELP_BOLD(...) logLog(HELP_BOLD, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+
+#define LOG_D(...) logLog(DEBUG, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+#define LOG_I(...) logLog(INFO, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+#define LOG_W(...) logLog(WARNING, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+#define LOG_E(...) logLog(ERROR, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+#define LOG_F(...) logLog(FATAL, __FUNCTION__, __LINE__, false, __VA_ARGS__);
+
+#define PLOG_D(...) logLog(DEBUG, __FUNCTION__, __LINE__, true, __VA_ARGS__);
+#define PLOG_I(...) logLog(INFO, __FUNCTION__, __LINE__, true, __VA_ARGS__);
+#define PLOG_W(...) logLog(WARNING, __FUNCTION__, __LINE__, true, __VA_ARGS__);
+#define PLOG_E(...) logLog(ERROR, __FUNCTION__, __LINE__, true, __VA_ARGS__);
+#define PLOG_F(...) logLog(FATAL, __FUNCTION__, __LINE__, true, __VA_ARGS__);
+
+/*
+ * Log levels. The numeric values matter: logLog() uses the level as an index into its
+ * per-level table, so any change here must be mirrored there.
+ */
+enum llevel_t {
+       /* The two help levels are printed without the "[time][level][pid] fn():line" prefix */
+       HELP = 0,
+       HELP_BOLD,
+       /* Printed only when verbose logging is enabled */
+       DEBUG,
+       INFO,
+       WARNING,
+       ERROR,
+       /* logLog() calls exit(1) after printing a message of this level */
+       FATAL
+};
+
+/*
+ * Opens the log destination ('logfile', or a default when it is NULL) and sets the verbosity.
+ * Returns false when the destination cannot be opened.
+ */
+bool logInitLogFile(struct nsjconf_t *nsjconf, const char *logfile, bool is_verbose);
+/*
+ * Writes one log line; normally reached through the LOG_* / PLOG_* macros. 'fmt' and the
+ * following arguments are checked by the compiler like a printf() call.
+ */
+void logLog(enum llevel_t ll, const char *fn, int ln, bool perr, const char *fmt, ...)
+    __attribute__ ((format(printf, 5, 6)));
+/* Logs that the server is stopping because signal 'sig' was caught */
+void logStop(int sig);
+
+#endif                         /* _LOG_H */
diff --git a/net.c b/net.c
new file mode 100644 (file)
index 0000000..fa1cd28
--- /dev/null
+++ b/net.c
@@ -0,0 +1,178 @@
+/*
+
+   nsjail - networking routines
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+#include "net.h"
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <netinet/ip6.h>
+#include <netinet/tcp.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "log.h"
+
+/* Tells whether fd is a socket, i.e. whether its SO_TYPE option can be queried */
+static bool netIsSocket(int fd)
+{
+       int sock_type;
+       socklen_t len = sizeof(sock_type);
+
+       return getsockopt(fd, SOL_SOCKET, SO_TYPE, &sock_type, &len) != -1;
+}
+
+/*
+ * Enforces the per-IP connection limit.
+ *
+ * Counts the entries of nsjconf->pids whose remote address equals the peer address of
+ * 'connsock' and compares that count with nsjconf->max_conns_per_ip.
+ *
+ * Returns true if the connection may be served (the limit is 0, i.e. unlimited, or has
+ * not been reached yet), false if it has to be rejected.
+ */
+bool netLimitConns(struct nsjconf_t * nsjconf, int connsock)
+{
+       /* 0 means 'unlimited' */
+       if (nsjconf->max_conns_per_ip == 0) {
+               return true;
+       }
+
+       /*
+        * Zeroed up-front: netConnToText() leaves the structure untouched when the
+        * descriptor is not a socket or the peer address cannot be obtained
+        */
+       struct sockaddr_in6 addr;
+       memset(&addr, 0, sizeof(addr));
+       char cs_addr[64];
+       netConnToText(connsock, true /* remote */ , cs_addr, sizeof(cs_addr), &addr);
+
+       unsigned int cnt = 0;
+       struct pids_t *p;
+       LIST_FOREACH(p, &nsjconf->pids, pointers) {
+               /* Compare the complete IPv6 address (all 16 bytes), not only its first byte */
+               if (memcmp
+                   (addr.sin6_addr.s6_addr, p->remote_addr.sin6_addr.s6_addr,
+                    sizeof(addr.sin6_addr.s6_addr)) == 0) {
+                       cnt++;
+               }
+       }
+
+       if (cnt >= nsjconf->max_conns_per_ip) {
+               LOG_W("Rejecting connection from '%s', max_conns_per_ip limit reached: %u", cs_addr,
+                     nsjconf->max_conns_per_ip);
+               return false;
+       }
+
+       return true;
+}
+
+/*
+ * Creates an IPv6 TCP socket bound to the wildcard address on 'port' and puts it into
+ * the listening state.
+ *
+ * Returns the listening descriptor, or -1 on error. The socket is closed again on every
+ * error path, so no descriptor is leaked.
+ */
+int netGetRecvSocket(int port)
+{
+       if (port < 1 || port > 65535) {
+               LOG_F("TCP port %d out of bounds (1 <= port <= 65535)", port);
+               /* In case the fatal log call returns, never bind to a truncated port number */
+               return -1;
+       }
+
+       int sockfd = socket(AF_INET6, SOCK_STREAM, 0);
+       if (sockfd == -1) {
+               PLOG_E("socket(AF_INET6)");
+               return -1;
+       }
+       int so = 1;
+       if (setsockopt(sockfd, SOL_SOCKET, SO_REUSEADDR, &so, sizeof(so)) == -1) {
+               /* Log first: close() could overwrite errno */
+               PLOG_E("setsockopt(%d, SO_REUSEADDR)", sockfd);
+               close(sockfd);
+               return -1;
+       }
+       struct sockaddr_in6 addr = {
+               .sin6_family = AF_INET6,
+               .sin6_port = htons(port),
+               .sin6_flowinfo = 0,
+               .sin6_addr = in6addr_any,
+               .sin6_scope_id = 0,
+       };
+       if (bind(sockfd, (struct sockaddr *)&addr, sizeof(addr)) == -1) {
+               PLOG_E("bind(port:%d)", port);
+               close(sockfd);
+               return -1;
+       }
+       if (listen(sockfd, SOMAXCONN) == -1) {
+               PLOG_E("listen(%d)", SOMAXCONN);
+               close(sockfd);
+               return -1;
+       }
+
+       char ss_addr[64];
+       netConnToText(sockfd, false /* remote */ , ss_addr, sizeof(ss_addr), NULL);
+       LOG_I("Listening on %s", ss_addr);
+
+       return sockfd;
+}
+
+/*
+ * Accepts one connection on 'listenfd', logs both of its endpoints and enables TCP_CORK
+ * on it (a failure to do so is only reported as a warning).
+ *
+ * Returns the connected descriptor, or -1 if accept() failed; an accept() interrupted
+ * by a signal (EINTR) is not logged.
+ */
+int netAcceptConn(int listenfd)
+{
+       struct sockaddr_in6 peer;
+       socklen_t peer_len = sizeof(peer);
+
+       int fd = accept(listenfd, (struct sockaddr *)&peer, &peer_len);
+       if (fd == -1) {
+               if (errno == EINTR) {
+                       return -1;
+               }
+               PLOG_E("accept(%d)", listenfd);
+               return -1;
+       }
+
+       char remote_txt[64];
+       char local_txt[64];
+       netConnToText(fd, true /* remote */ , remote_txt, sizeof(remote_txt), NULL);
+       netConnToText(fd, false /* remote */ , local_txt, sizeof(local_txt), NULL);
+       LOG_I("New connection from: %s on: %s", remote_txt, local_txt);
+
+       int enable = 1;
+       if (setsockopt(fd, SOL_TCP, TCP_CORK, &enable, sizeof(enable)) == -1) {
+               PLOG_W("setsockopt(%d, TCP_CORK)", fd);
+       }
+       return fd;
+}
+
+/*
+ * Writes a textual "address:port" description of one endpoint of 'fd' into 'buf'
+ * (at most 's' bytes, always via snprintf()).
+ *
+ *  fd           - descriptor to describe; if it is not a socket, "[STANDALONE_MODE]" is written
+ *  remote       - true: describe the peer (getpeername), false: the local end (getsockname)
+ *  addr_or_null - if non-NULL, receives a copy of the raw socket address; it is left
+ *                 untouched when fd is not a socket or the address lookup fails
+ */
+void netConnToText(int fd, bool remote, char *buf, size_t s, struct sockaddr_in6 *addr_or_null)
+{
+       if (netIsSocket(fd) == false) {
+               snprintf(buf, s, "[STANDALONE_MODE]");
+               return;
+       }
+
+       /* Zeroed, so that a shorter (non-IPv6) address never leaves stale stack bytes behind */
+       struct sockaddr_in6 addr;
+       memset(&addr, 0, sizeof(addr));
+       socklen_t addrlen = sizeof(addr);
+       if (remote) {
+               if (getpeername(fd, (struct sockaddr *)&addr, &addrlen) == -1) {
+                       PLOG_W("getpeername(%d)", fd);
+                       snprintf(buf, s, "[unknown]");
+                       return;
+               }
+       } else {
+               if (getsockname(fd, (struct sockaddr *)&addr, &addrlen) == -1) {
+                       PLOG_W("getsockname(%d)", fd);
+                       snprintf(buf, s, "[unknown]");
+                       return;
+               }
+       }
+
+       if (addr_or_null) {
+               memcpy(addr_or_null, &addr, sizeof(*addr_or_null));
+       }
+
+       /*
+        * Fixed-size scratch buffer instead of a VLA sized by the caller: a zero-length VLA
+        * is undefined behaviour and a huge one can overflow the stack. INET6_ADDRSTRLEN is
+        * large enough for any textual IPv6 address.
+        */
+       char tmp[INET6_ADDRSTRLEN];
+       if (inet_ntop(AF_INET6, addr.sin6_addr.s6_addr, tmp, sizeof(tmp)) == NULL) {
+               PLOG_W("inet_ntop()");
+               snprintf(buf, s, "[unknown]:%hu", ntohs(addr.sin6_port));
+               return;
+       }
+       snprintf(buf, s, "%s:%hu", tmp, ntohs(addr.sin6_port));
+       return;
+}
diff --git a/net.h b/net.h
new file mode 100644 (file)
index 0000000..b136d10
--- /dev/null
+++ b/net.h
@@ -0,0 +1,34 @@
+/*
+
+   nsjail - networking routines
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+#ifndef _NET_H
+#define _NET_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "common.h"
+
+/*
+ * Returns false if the peer of 'connsock' already has max_conns_per_ip (or more)
+ * connections registered in nsjconf, true otherwise (0 means 'unlimited')
+ */
+bool netLimitConns(struct nsjconf_t *nsjconf, int connsock);
+/* Returns a listening IPv6 TCP socket bound to 'port', or -1 on error */
+int netGetRecvSocket(int port);
+/* Accepts a single connection on 'listenfd'; returns its descriptor, or -1 on error */
+int netAcceptConn(int listenfd);
+/*
+ * Writes "address:port" of the remote (remote == true) or local end of 'fd' into 'buf'
+ * (size 's'); if 'addr_or_null' is not NULL it receives the raw socket address
+ */
+void netConnToText(int fd, bool remote, char *buf, size_t s, struct sockaddr_in6 *addr_or_null);
+
+#endif                         /* _NET_H */
diff --git a/nsjail.c b/nsjail.c
new file mode 100644 (file)
index 0000000..3f327c8
--- /dev/null
+++ b/nsjail.c
@@ -0,0 +1,218 @@
+/*
+
+   nsjail
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+#include "nsjail.h"
+
+#include <errno.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/time.h>
+#include <unistd.h>
+
+#include "cmdline.h"
+#include "common.h"
+#include "log.h"
+#include "net.h"
+#include "subproc.h"
+
+/*
+ * Flags shared between the signal handler (nsjailSig) and the main loops. Objects written
+ * from a signal handler and read by regular code have to be 'volatile sig_atomic_t',
+ * otherwise the accesses are not guaranteed to be atomic or even to be performed.
+ *
+ *  nsjailSigFatal - number of the last signal requesting termination (0 if none)
+ *  nsjailShowProc - set by SIGUSR1; asks the main loop to display the subprocesses
+ */
+static __thread volatile sig_atomic_t nsjailSigFatal = 0;
+static __thread volatile sig_atomic_t nsjailShowProc = false;
+
+/*
+ * Common signal handler: SIGALRM and SIGCHLD are ignored here, SIGUSR1 requests
+ * a process listing, and every other signal is recorded as the fatal one.
+ */
+static void nsjailSig(int sig)
+{
+       switch (sig) {
+       case SIGALRM:
+       case SIGCHLD:
+               /* Nothing to record for these two */
+               return;
+       case SIGUSR1:
+               nsjailShowProc = true;
+               return;
+       default:
+               nsjailSigFatal = sig;
+               return;
+       }
+}
+
+/*
+ * Installs nsjailSig() as the handler of 'sig', with an empty signal mask and no flags.
+ * Returns false (after logging the error) if sigaction() fails.
+ */
+static bool nsjailSetSigHandler(int sig)
+{
+       LOG_D("Setting sighandler for signal '%d' (%s)", sig, strsignal(sig));
+
+       /* Members which are not named here are zero-initialized */
+       struct sigaction act = {
+               .sa_handler = nsjailSig,
+               .sa_flags = 0,
+       };
+       sigemptyset(&act.sa_mask);
+
+       if (sigaction(sig, &act, NULL) != -1) {
+               return true;
+       }
+       PLOG_E("sigaction(%d)", sig);
+       return false;
+}
+
+/*
+ * Installs the common handler for all signals nsjail cares about, in a fixed order.
+ * Stops at, and reports, the first failure.
+ */
+static bool nsjailSetSigHandlers(void)
+{
+       static const int handled[] = { SIGINT, SIGUSR1, SIGALRM, SIGCHLD, SIGTERM };
+
+       for (size_t i = 0; i < sizeof(handled) / sizeof(handled[0]); i++) {
+               if (!nsjailSetSigHandler(handled[i])) {
+                       return false;
+               }
+       }
+       return true;
+}
+
+/*
+ * Arms a periodic real-time timer: first expiry after one second, then once every
+ * second. Returns false (after logging the error) if setitimer() fails.
+ */
+static bool nsjailSetTimer(void)
+{
+       const struct timeval one_sec = {.tv_sec = 1,.tv_usec = 0 };
+       struct itimerval tick = {
+               .it_value = one_sec,
+               .it_interval = one_sec,
+       };
+
+       if (setitimer(ITIMER_REAL, &tick, NULL) != -1) {
+               return true;
+       }
+       PLOG_E("setitimer(ITIMER_REAL)");
+       return false;
+}
+
+/*
+ * TCP server mode: listens on nsjconf->port and runs one child per accepted connection,
+ * with the connection serving as its stdin, stdout and stderr. The loop ends once
+ * a fatal signal has been recorded; all children are killed then.
+ */
+static void nsjailListenMode(struct nsjconf_t *nsjconf)
+{
+       const int srvfd = netGetRecvSocket(nsjconf->port);
+       if (srvfd == -1) {
+               return;
+       }
+
+       while (nsjailSigFatal <= 0) {
+               if (nsjailShowProc == true) {
+                       nsjailShowProc = false;
+                       subprocDisplay(nsjconf);
+               }
+
+               int clifd = netAcceptConn(srvfd);
+               if (clifd >= 0) {
+                       subprocRunChild(nsjconf, clifd, clifd, clifd);
+                       /* The parent's copy of the descriptor is not needed any more */
+                       close(clifd);
+               }
+               subprocReap(nsjconf);
+       }
+
+       /* A fatal signal has been received */
+       subprocKillAll(nsjconf);
+       logStop(nsjailSigFatal);
+}
+
+/*
+ * Standalone mode: runs the child directly on nsjail's own stdin/stdout/stderr.
+ *
+ * With MODE_STANDALONE_ONCE the function returns as soon as no child is left; in any
+ * other mode a new child is started whenever the previous one is gone. A recorded fatal
+ * signal kills all children and ends the loop.
+ */
+static void nsjailStandaloneMode(struct nsjconf_t *nsjconf)
+{
+       subprocRunChild(nsjconf, STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO);
+       for (;;) {
+               /* No children left: either we are done, or it's time for a re-run */
+               if (subprocCount(nsjconf) == 0) {
+                       if (nsjconf->mode == MODE_STANDALONE_ONCE) {
+                               return;
+                       }
+                       subprocRunChild(nsjconf, STDIN_FILENO, STDOUT_FILENO, STDERR_FILENO);
+               }
+               /* SIGUSR1 has been received */
+               if (nsjailShowProc == true) {
+                       nsjailShowProc = false;
+                       subprocDisplay(nsjconf);
+               }
+               if (nsjailSigFatal > 0) {
+                       subprocKillAll(nsjconf);
+                       logStop(nsjailSigFatal);
+                       return;
+               }
+               /*
+                * Sleep until a signal handler has run. main() arms a 1-second periodic
+                * SIGALRM (nsjailSetTimer) before entering this loop, so a signal that
+                * arrives between the checks above and pause() delays the loop by about
+                * one second at most.
+                */
+               pause();
+               subprocReap(nsjconf);
+       }
+}
+
+/*
+ * Program entry point: fills the configuration with built-in defaults, hands it to the
+ * command-line parser, optionally daemonizes, installs the signal handlers and the
+ * periodic timer, and finally enters either the TCP listening loop or the standalone loop.
+ *
+ * Exits with status 1 if command-line parsing, signal handler setup or timer setup fails;
+ * returns 0 once the selected mode's loop has finished.
+ */
+int main(int argc, char *argv[])
+{
+       /* Built-in defaults; a pointer to this structure is passed to cmdlineParse() below */
+       struct nsjconf_t nsjconf = {
+               .hostname = "NSJAIL",
+               .chroot = "/chroot",
+               .argv = NULL,
+               .port = 31337,
+               .uid = -1,
+               .gid = -1,
+               .daemonize = false,
+               .tlimit = 0,
+               .apply_sandbox = true,
+               .verbose = false,
+               .keep_caps = false,
+               .rl_as = 512 * (1024 * 1024),
+               .rl_core = 0,
+               .rl_cpu = 600,
+               .rl_fsize = 1 * (1024 * 1024),
+               .rl_nofile = 32,
+               .rl_nproc = cmdlineParseRLimit(RLIMIT_NPROC, "def", 1),
+               .rl_stack = cmdlineParseRLimit(RLIMIT_STACK, "def", 1),
+               .personality = 0,
+               .clone_newnet = true,
+               .clone_newuser = true,
+               .clone_newns = true,
+               .clone_newpid = true,
+               .clone_newipc = true,
+               .clone_newuts = true,
+               .mode = MODE_LISTEN_TCP,
+               .is_root_rw = false,
+               .is_silent = false,
+               .bindmountpts = NULL,
+               .tmpfsmountpts = NULL,
+               .initial_uid = getuid(),
+               .initial_gid = getgid(),
+               .max_conns_per_ip = 0,
+       };
+
+       if (!cmdlineParse(argc, argv, &nsjconf)) {
+               exit(1);
+       }
+       /*
+        * NOTE(review): the problem is only logged, nothing in this branch returns or
+        * exits - confirm that continuing without root privileges is intended here
+        */
+       if (nsjconf.clone_newuser == false && geteuid() != 0) {
+               LOG_E("--disable_clone_newuser requires root() privs");
+       }
+       if (nsjconf.daemonize && (daemon(0, 0) == -1)) {
+               PLOG_F("daemon");
+       }
+       cmdlineLogParams(&nsjconf);
+       if (nsjailSetSigHandlers() == false) {
+               exit(1);
+       }
+       /* The periodic SIGALRM regularly interrupts the blocking calls of the main loops */
+       if (nsjailSetTimer() == false) {
+               exit(1);
+       }
+
+       if (nsjconf.mode == MODE_LISTEN_TCP) {
+               nsjailListenMode(&nsjconf);
+       } else {
+               nsjailStandaloneMode(&nsjconf);
+       }
+       return 0;
+}
diff --git a/nsjail.h b/nsjail.h
new file mode 100644 (file)
index 0000000..0df5319
--- /dev/null
+++ b/nsjail.h
@@ -0,0 +1,24 @@
+/*
+
+   nsjail
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+#ifndef _NSJAIL_H
+#define _NSJAIL_H
+
+#endif                         /* _NSJAIL_H */
diff --git a/sandbox.c b/sandbox.c
new file mode 100644 (file)
index 0000000..a115d6b
--- /dev/null
+++ b/sandbox.c
@@ -0,0 +1,104 @@
+/*
+
+   nsjail - seccomp-bpf sandboxing
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+
+#include "sandbox.h"
+
+#include <errno.h>
+#include <sys/prctl.h>
+
+/* TBREMOVED */
+#include <signal.h>
+#include <unistd.h>
+
+#include "common.h"
+#include "log.h"
+
+#if defined(__x86_64__) || defined(__i386__)
+#include "seccomp/bpf-helper.h"
+
+/*
+ * Builds a seccomp-bpf filter and installs it for the current process.
+ *
+ * The demo rules (returning ENOENT for the syslog and uselib syscalls, for both the i386
+ * and the x86_64 ABI) are currently compiled out with '#if 0', so the filter that is
+ * actually installed consists of a single ALLOW statement.
+ *
+ * Returns false if the jumps cannot be resolved or if either prctl() call fails.
+ */
+static bool sandboxPrepareAndCommit(void)
+{
+       struct bpf_labels l = {.count = 0 };
+       struct sock_filter filter[] = {
+#if 0
+               LOAD_ARCH,
+               JEQ32(AUDIT_ARCH_I386, JUMP(&l, label_i386)),
+               JEQ32(AUDIT_ARCH_X86_64, JUMP(&l, label_x86_64)),
+
+               /* I386 */
+               LABEL(&l, label_i386),
+               LOAD_SYSCALL_NR,
+#define __NR_syslog_32 103
+#define __NR_uselib_32 86
+               JEQ32(__NR_syslog_32, ERRNO(ENOENT)),
+               JEQ32(__NR_uselib_32, ERRNO(ENOENT)),
+               ALLOW,
+
+               /* X86_64 */
+               LABEL(&l, label_x86_64),
+               LOAD_SYSCALL_NR,
+#define __NR_syslog_64 103
+#define __NR_uselib_64 134
+               JEQ32(__NR_syslog_64, ERRNO(ENOENT)),
+               JEQ32(__NR_uselib_64, ERRNO(ENOENT)),
+#endif /* 0 */
+               ALLOW,
+       };
+
+       struct sock_fprog prog = {
+               .filter = filter,
+               .len = (unsigned short)(sizeof(filter) / sizeof(filter[0])),
+       };
+       /* Turn the symbolic JUMP()/LABEL() placeholders into relative jump offsets */
+       if (bpf_resolve_jumps(&l, filter, sizeof(filter) / sizeof(*filter)) != 0) {
+               LOG_W("bpf_resolve_jumps() failed");
+               return false;
+       }
+
+       /* Set before the filter is installed; a failure here aborts the whole setup */
+       if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0)) {
+               PLOG_W("prctl(PR_SET_NO_NEW_PRIVS, 1) failed");
+               return false;
+       }
+       if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog, 0, 0)) {
+               PLOG_W("prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER) failed");
+               return false;
+       }
+       return true;
+}
+#endif                         /* defined(__x86_64__) || defined(__i386__) */
+
+/*
+ * Applies the seccomp-bpf sandbox unless it is disabled in the configuration.
+ *
+ * Returns true if the sandbox is disabled, was installed successfully, or is not
+ * implemented for this CPU architecture (only a warning is logged then); returns
+ * false if installing the filter failed.
+ */
+bool sandboxApply(struct nsjconf_t * nsjconf)
+{
+       if (!nsjconf->apply_sandbox) {
+               return true;
+       }
+#if defined(__x86_64__) || defined(__i386__)
+       return sandboxPrepareAndCommit();
+#else                          /* defined(__x86_64__) || defined(__i386__) */
+       LOG_W("There's no seccomp-bpf implementation ready for the current CPU architecture. Sandbox not enabled");
+       return true;
+#endif                         /* defined(__x86_64__) || defined(__i386__) */
+}
diff --git a/sandbox.h b/sandbox.h
new file mode 100644 (file)
index 0000000..17291ab
--- /dev/null
+++ b/sandbox.h
@@ -0,0 +1,31 @@
+/*
+
+   nsjail - seccomp-bpf sandboxing
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+
+#ifndef _SANDBOX_H
+#define _SANDBOX_H
+
+#include <stdbool.h>
+
+#include "common.h"
+
+/*
+ * Installs the seccomp-bpf filter if nsjconf->apply_sandbox is set.
+ * Returns false only if installing the filter failed.
+ */
+bool sandboxApply(struct nsjconf_t *nsjconf);
+
+#endif                         /* _SANDBOX_H */
diff --git a/seccomp/bpf-helper.c b/seccomp/bpf-helper.c
new file mode 100644 (file)
index 0000000..05cb4d5
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ * Seccomp BPF helper functions
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_ATTACH_SECCOMP_FILTER).
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "bpf-helper.h"
+
+/*
+ * Resolves the symbolic JUMP()/LABEL() placeholders of a filter program in place.
+ *
+ *  labels - label table filled by seccomp_bpf_label() while the filter was built
+ *  filter - the filter program
+ *  count  - number of instructions in 'filter'
+ *
+ * Returns 0 on success, -1 if 'count' is 0 or exceeds BPF_MAXINSNS, and 1 if a label is
+ * unresolved, used twice, or out of range of the label table.
+ */
+int bpf_resolve_jumps(struct bpf_labels *labels,
+                     struct sock_filter *filter, size_t count)
+{
+       size_t i;
+
+       if (count < 1 || count > BPF_MAXINSNS)
+               return -1;
+       /*
+       * Walk it once, backwards, to build the label table and do fixups.
+       * Since backward jumps are disallowed by BPF, this is easy.
+       *
+       * The position is tracked as a size_t index: an 8-bit counter would wrap
+       * for programs longer than 256 instructions, and a pointer must not be
+       * decremented below the start of the array.
+       */
+       for (i = 0; i < count; ++i) {
+               size_t offset = count - i - 1;
+               struct sock_filter *instr = &filter[offset];
+
+               if (instr->code != (BPF_JMP+BPF_JA))
+                       continue;
+               switch ((instr->jt<<8)|instr->jf) {
+               case (JUMP_JT<<8)|JUMP_JF:
+               case (LABEL_JT<<8)|LABEL_JF:
+                       /* Ids handed out by seccomp_bpf_label() are always below count */
+                       if (labels->count < 0 ||
+                           instr->k >= (__u32)labels->count) {
+                               fprintf(stderr, "Invalid label id: %u\n",
+                                       (unsigned int)instr->k);
+                               return 1;
+                       }
+                       break;
+               default:
+                       /* A regular BPF_JA instruction, nothing to fix up */
+                       continue;
+               }
+               if (instr->jt == JUMP_JT) {
+                       if (labels->labels[instr->k].location == 0xffffffff) {
+                               fprintf(stderr, "Unresolved label: '%s'\n",
+                                       labels->labels[instr->k].label);
+                               return 1;
+                       }
+                       /* Jump offsets are relative to the next instruction */
+                       instr->k = labels->labels[instr->k].location -
+                                   (__u32)(offset + 1);
+                       instr->jt = 0;
+                       instr->jf = 0;
+               } else {
+                       if (labels->labels[instr->k].location != 0xffffffff) {
+                               fprintf(stderr, "Duplicate label use: '%s'\n",
+                                       labels->labels[instr->k].label);
+                               return 1;
+                       }
+                       labels->labels[instr->k].location = (__u32)offset;
+                       instr->k = 0; /* fall through */
+                       instr->jt = 0;
+                       instr->jf = 0;
+               }
+       }
+       return 0;
+}
+
+/*
+ * Simple lookup table for labels.
+ *
+ * Returns the id (index into labels->labels) of 'label'. An unknown label is appended
+ * to the table with its location marked as unresolved (0xffffffff). The string is not
+ * copied, only the pointer is stored. Terminates the process with exit(1) if a new
+ * label does not fit into the table any more.
+ */
+__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label)
+{
+       struct __bpf_label *begin = labels->labels, *end;
+       int id;
+
+       /* Look the label up first, so that known labels still resolve when the table is full */
+       end = begin + labels->count;
+       for (id = 0; begin < end; ++begin, ++id) {
+               if (!strcmp(label, begin->label))
+                       return id;
+       }
+       /* Not found: 'begin' points at the first free slot, 'id' is its index */
+       if (labels->count >= BPF_LABELS_MAX) {
+               fprintf(stderr, "Too many labels\n");
+               exit(1);
+       }
+       begin->label = label;
+       begin->location = 0xffffffff;
+       labels->count++;
+       return id;
+}
+
+/* Dumps 'count' instructions of 'filter' to stdout, one "{ code,jt,jf,k }" line each */
+void seccomp_bpf_print(struct sock_filter *filter, size_t count)
+{
+       size_t i;
+
+       for (i = 0; i < count; i++) {
+               const struct sock_filter *insn = &filter[i];
+
+               printf("{ code=%u,jt=%u,jf=%u,k=%u },\n",
+                       insn->code, insn->jt, insn->jf, insn->k);
+       }
+}
diff --git a/seccomp/bpf-helper.h b/seccomp/bpf-helper.h
new file mode 100644 (file)
index 0000000..38ee70f
--- /dev/null
@@ -0,0 +1,243 @@
+/*
+ * Example wrapper around BPF macros.
+ *
+ * Copyright (c) 2012 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ * Author: Will Drewry <wad@chromium.org>
+ *
+ * The code may be used by anyone for any purpose,
+ * and can serve as a starting point for developing
+ * applications using prctl(PR_SET_SECCOMP, 2, ...).
+ *
+ * No guarantees are provided with respect to the correctness
+ * or functionality of this code.
+ */
+#ifndef __BPF_HELPER_H__
+#define __BPF_HELPER_H__
+
+#include <asm/bitsperlong.h>   /* for __BITS_PER_LONG */
+#include <endian.h>
+#include <linux/filter.h>
+#include <linux/seccomp.h>     /* for seccomp_data */
+#include <linux/types.h>
+#include <linux/unistd.h>
+#include <stddef.h>
+
+#define BPF_LABELS_MAX 256
+/*
+ * Table of the symbolic labels used by one filter program.
+ * Entries are added by seccomp_bpf_label(); at most BPF_LABELS_MAX of them fit.
+ */
+struct bpf_labels {
+       int count;              /* number of entries of labels[] in use */
+       struct __bpf_label {
+               const char *label;      /* label name (pointer is stored, string is not copied) */
+               __u32 location;         /* instruction index; 0xffffffff until resolved */
+       } labels[BPF_LABELS_MAX];
+};
+
+/* Replaces the JUMP()/LABEL() placeholders of 'filter' in place; returns 0 on success */
+int bpf_resolve_jumps(struct bpf_labels *labels,
+                     struct sock_filter *filter, size_t count);
+/* Returns the id of 'label' in the table, adding it first if it is not known yet */
+__u32 seccomp_bpf_label(struct bpf_labels *labels, const char *label);
+/* Prints 'count' instructions of 'filter' to stdout */
+void seccomp_bpf_print(struct sock_filter *filter, size_t count);
+
+/*
+ * Marker values stored in the jt/jf fields of a BPF_JMP+BPF_JA instruction;
+ * bpf_resolve_jumps() recognizes them as symbolic jumps (0xff) and label
+ * definitions (0xfe) and rewrites those instructions.
+ */
+#define JUMP_JT 0xff
+#define JUMP_JF 0xff
+#define LABEL_JT 0xfe
+#define LABEL_JF 0xfe
+
+/* Filter return statements: SECCOMP_RET_ALLOW and SECCOMP_RET_KILL */
+#define ALLOW \
+       BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_ALLOW)
+#define DENY \
+       BPF_STMT(BPF_RET+BPF_K, SECCOMP_RET_KILL)
+/* Placeholder for a jump to 'label'; the label id is kept in the k field */
+#define JUMP(labels, label) \
+       BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
+                JUMP_JT, JUMP_JF)
+/* Placeholder marking the position of 'label'; the label id is kept in the k field */
+#define LABEL(labels, label) \
+       BPF_JUMP(BPF_JMP+BPF_JA, FIND_LABEL((labels), (label)), \
+                LABEL_JT, LABEL_JF)
+/* Executes 'jt' if A equals 'nr', otherwise skips exactly one instruction */
+#define SYSCALL(nr, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (nr), 0, 1), \
+       jt
+
+/* Lame, but just an example */
+/* The label name is stringified and looked up (or registered) in the label table */
+#define FIND_LABEL(labels, label) seccomp_bpf_label((labels), #label)
+
+#define EXPAND(...) __VA_ARGS__
+
+/*
+ * Ensure that we load the logically correct offset.
+ * LO_ARG(idx) is the offset, inside struct seccomp_data, of the low 32 bits of
+ * syscall argument 'idx'. Both expansions are fully parenthesized so that the macro
+ * behaves as a single value inside any surrounding expression.
+ */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define LO_ARG(idx) (offsetof(struct seccomp_data, args[(idx)]))
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define LO_ARG(idx) (offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32))
+#else
+#error "Unknown endianness"
+#endif
+
+/* Map all width-sensitive operations */
+#if __BITS_PER_LONG == 32
+
+#define JEQ(x, jt) JEQ32(x, EXPAND(jt))
+#define JNE(x, jt) JNE32(x, EXPAND(jt))
+#define JGT(x, jt) JGT32(x, EXPAND(jt))
+#define JLT(x, jt) JLT32(x, EXPAND(jt))
+#define JGE(x, jt) JGE32(x, EXPAND(jt))
+#define JLE(x, jt) JLE32(x, EXPAND(jt))
+#define JA(x, jt) JA32(x, EXPAND(jt))
+#define ARG(i) ARG_32(i)
+
+#elif __BITS_PER_LONG == 64
+
+/* Ensure that we load the logically correct offset. */
+#if __BYTE_ORDER == __LITTLE_ENDIAN
+#define ENDIAN(_lo, _hi) _lo, _hi
+#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)]) + sizeof(__u32)
+#elif __BYTE_ORDER == __BIG_ENDIAN
+#define ENDIAN(_lo, _hi) _hi, _lo
+#define HI_ARG(idx) offsetof(struct seccomp_data, args[(idx)])
+#endif
+
+union arg64 {
+       struct {
+               __u32 ENDIAN(lo32, hi32);
+       };
+       __u64 u64;
+};
+
+#define JEQ(x, jt) \
+       JEQ64(((union arg64){.u64 = (x)}).lo32, \
+             ((union arg64){.u64 = (x)}).hi32, \
+             EXPAND(jt))
+#define JGT(x, jt) \
+       JGT64(((union arg64){.u64 = (x)}).lo32, \
+             ((union arg64){.u64 = (x)}).hi32, \
+             EXPAND(jt))
+#define JGE(x, jt) \
+       JGE64(((union arg64){.u64 = (x)}).lo32, \
+             ((union arg64){.u64 = (x)}).hi32, \
+             EXPAND(jt))
+#define JNE(x, jt) \
+       JNE64(((union arg64){.u64 = (x)}).lo32, \
+             ((union arg64){.u64 = (x)}).hi32, \
+             EXPAND(jt))
+#define JLT(x, jt) \
+       JLT64(((union arg64){.u64 = (x)}).lo32, \
+             ((union arg64){.u64 = (x)}).hi32, \
+             EXPAND(jt))
+#define JLE(x, jt) \
+       JLE64(((union arg64){.u64 = (x)}).lo32, \
+             ((union arg64){.u64 = (x)}).hi32, \
+             EXPAND(jt))
+
+#define JA(x, jt) \
+       JA64(((union arg64){.u64 = (x)}).lo32, \
+              ((union arg64){.u64 = (x)}).hi32, \
+              EXPAND(jt))
+#define ARG(i) ARG_64(i)
+
+#else
+#error __BITS_PER_LONG value unusable.
+#endif
+
+/* Loads the (low 32 bits of the) arg into A */
+#define ARG_32(idx) \
+       BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx))
+
+/*
+ * Loads both halves of the arg: lo is saved in M[0], hi is saved in M[1],
+ * and hi is what remains in A afterwards.
+ */
+#define ARG_64(idx) \
+       BPF_STMT(BPF_LD+BPF_W+BPF_ABS, LO_ARG(idx)), \
+       BPF_STMT(BPF_ST, 0), /* lo -> M[0] */ \
+       BPF_STMT(BPF_LD+BPF_W+BPF_ABS, HI_ARG(idx)), \
+       BPF_STMT(BPF_ST, 1) /* hi -> M[1] */
+
+#define JEQ32(value, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 0, 1), \
+       jt
+
+#define JNE32(value, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (value), 1, 0), \
+       jt
+
+/*
+ * Expects A=hi with lo in M[0] and hi in M[1] (the state left by ARG_64).
+ * Checks the hi first, then swaps in the lo from M[0] to check it as well.
+ * A holds hi again on every way out. If hi differs, the jump goes straight
+ * past the end of the sequence (no reload needed, A was never changed).
+ */
+#define JEQ64(lo, hi, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+       BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 0, 2), \
+       BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+       jt, \
+       BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+/*
+ * Executes 'jt' if the 64-bit arg differs from (hi:lo). Expects A=hi, M[0]=lo, M[1]=hi.
+ * A differing hi word is already a match (jump straight to 'jt', A still holds hi);
+ * an equal hi word means the lo words have to be compared as well.
+ */
+#define JNE64(lo, hi, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \
+       BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (lo), 2, 0), \
+       BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+       jt, \
+       BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JA32(value, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (value), 0, 1), \
+       jt
+
+#define JA64(lo, hi, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (hi), 3, 0), \
+       BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+       BPF_JUMP(BPF_JMP+BPF_JSET+BPF_K, (lo), 0, 2), \
+       BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+       jt, \
+       BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JGE32(value, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 0, 1), \
+       jt
+
+#define JLT32(value, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (value), 1, 0), \
+       jt
+
+/* Shortcut checking if hi > arg.hi. */
+#define JGE64(lo, hi, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+       BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+       BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 0, 2), \
+       BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+       jt, \
+       BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+/*
+ * Executes 'jt' if the 64-bit arg is strictly lower than (hi:lo). Expects A=hi,
+ * M[0]=lo, M[1]=hi. A lower hi word is a match right away, a greater one is not;
+ * for equal hi words the lo word has to be strictly lower, hence the JGE test
+ * (a JGT test would also accept equal values).
+ */
+#define JLT64(lo, hi, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (hi), 0, 4), \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+       BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+       BPF_JUMP(BPF_JMP+BPF_JGE+BPF_K, (lo), 2, 0), \
+       BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+       jt, \
+       BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JGT32(value, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 0, 1), \
+       jt
+
+#define JLE32(value, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (value), 1, 0), \
+       jt
+
+/* Check hi > args.hi first, then do the GE checking */
+#define JGT64(lo, hi, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 4, 0), \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 5), \
+       BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+       BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 0, 2), \
+       BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+       jt, \
+       BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define JLE64(lo, hi, jt) \
+       BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (hi), 6, 0), \
+       BPF_JUMP(BPF_JMP+BPF_JEQ+BPF_K, (hi), 0, 3), \
+       BPF_STMT(BPF_LD+BPF_MEM, 0), /* swap in lo */ \
+       BPF_JUMP(BPF_JMP+BPF_JGT+BPF_K, (lo), 2, 0), \
+       BPF_STMT(BPF_LD+BPF_MEM, 1), /* passed: swap hi back in */ \
+       jt, \
+       BPF_STMT(BPF_LD+BPF_MEM, 1) /* failed: swap hi back in */
+
+#define LOAD_SYSCALL_NR \
+       BPF_STMT(BPF_LD+BPF_W+BPF_ABS, \
+                offsetof(struct seccomp_data, nr))
+
+#endif  /* __BPF_HELPER_H__ */
diff --git a/subproc.c b/subproc.c
new file mode 100644 (file)
index 0000000..87c398e
--- /dev/null
+++ b/subproc.c
@@ -0,0 +1,220 @@
+/*
+
+   nsjail - subprocess management
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+
+#include "subproc.h"
+
+#include <arpa/inet.h>
+#include <errno.h>
+#include <netinet/in.h>
+#include <sched.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/prctl.h>
+#include <sys/queue.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "common.h"
+#include "contain.h"
+#include "log.h"
+#include "net.h"
+#include "sandbox.h"
+
+/*
+ * Child-side setup: apply every containment step in order and then execve()
+ * the configured command (nsjconf->argv).
+ *
+ * fd_in/fd_out/fd_err are handed to containSetupFD(). The function never
+ * returns to its caller: any failing step, or a failing execve(), terminates
+ * the process with exit status 1.
+ */
+static int subprocNewProc(struct nsjconf_t *nsjconf, int fd_in, int fd_out, int fd_err)
+{
+       if (containPrepareEnv(nsjconf) == false) {
+               exit(1);
+       }
+       if (containSetupFD(nsjconf, fd_in, fd_out, fd_err) == false) {
+               exit(1);
+       }
+       if (containMountFS(nsjconf) == false) {
+               exit(1);
+       }
+       if (containDropPrivs(nsjconf) == false) {
+               exit(1);
+       }
+       /* */
+       /* As non-root */
+       if (containSetLimits(nsjconf) == false) {
+               exit(1);
+       }
+       if (containMakeFdsCOE() == false) {
+               exit(1);
+       }
+       /* Should be the last one in the sequence */
+       if (sandboxApply(nsjconf) == false) {
+               exit(1);
+       }
+
+       /*
+        * Use a real, empty, NULL-terminated environment unless the caller asked
+        * to keep the current one. A NULL envp pointer is only accepted by
+        * execve() as a non-portable Linux extension, so never pass it.
+        */
+       char *const empty_env[] = { NULL };
+       char *const *env = empty_env;
+       if (nsjconf->keep_env == true) {
+               env = environ;
+       }
+
+       LOG_D("Trying to execve('%s')", nsjconf->argv[0]);
+       for (int i = 0; nsjconf->argv[i]; i++) {
+               LOG_D(" Arg[%d]: '%s'", i, nsjconf->argv[i]);
+       }
+       execve(nsjconf->argv[0], &nsjconf->argv[0], env);
+       /* Only reached when execve() failed */
+       PLOG_F("execve('%s')", nsjconf->argv[0]);
+       exit(1);
+}
+
+/*
+ * Start tracking a child: allocate a pids_t record holding its PID, the
+ * current time and the text/address of the peer connected on `sock`, and push
+ * it onto the head of nsjconf->pids. If the allocation fails the error is
+ * logged and the child is left untracked.
+ */
+static void subprocAdd(struct nsjconf_t *nsjconf, pid_t pid, int sock)
+{
+       struct pids_t *entry = malloc(sizeof(*entry));
+       if (!entry) {
+               PLOG_E("malloc");
+               return;
+       }
+
+       entry->pid = pid;
+       entry->start = time(NULL);
+       netConnToText(sock, true /* remote */ , entry->remote_txt,
+                     sizeof(entry->remote_txt), &entry->remote_addr);
+       LIST_INSERT_HEAD(&nsjconf->pids, entry, pointers);
+
+       LOG_D("Added pid '%d' with start time '%u' to the queue for IP: '%s'", pid,
+             (unsigned int)entry->start, entry->remote_txt);
+}
+
+/*
+ * Stop tracking `pid`: unlink and free the first record in nsjconf->pids
+ * carrying that PID. A warning is logged when no such record exists.
+ */
+static void subprocRemove(struct nsjconf_t *nsjconf, pid_t pid)
+{
+       struct pids_t *entry = LIST_FIRST(&nsjconf->pids);
+       while (entry != NULL && entry->pid != pid) {
+               entry = LIST_NEXT(entry, pointers);
+       }
+
+       if (entry == NULL) {
+               LOG_W("PID: %d not found (?)", pid);
+               return;
+       }
+
+       LOG_D("Removing pid '%d' from the queue (IP:'%s', start time:'%u')", entry->pid,
+             entry->remote_txt, (unsigned int)entry->start);
+       LIST_REMOVE(entry, pointers);
+       free(entry);
+}
+
+/* Return the number of child records currently linked into nsjconf->pids. */
+int subprocCount(struct nsjconf_t *nsjconf)
+{
+       int total = 0;
+       for (struct pids_t *entry = LIST_FIRST(&nsjconf->pids); entry != NULL;
+            entry = LIST_NEXT(entry, pointers)) {
+               total++;
+       }
+       return total;
+}
+
+/*
+ * Log the number of tracked children, then one line per child with its PID,
+ * remote host text and the seconds elapsed since it was recorded.
+ */
+void subprocDisplay(struct nsjconf_t *nsjconf)
+{
+       LOG_I("Total number of spawned namespaces: %d", subprocCount(nsjconf));
+
+       const time_t now = time(NULL);
+       struct pids_t *entry;
+       LIST_FOREACH(entry, &nsjconf->pids, pointers) {
+               const time_t elapsed = now - entry->start;
+               LOG_I("PID: %d, Remote host: %s, Run time: %ld sec.", entry->pid,
+                     entry->remote_txt, (long)elapsed);
+       }
+}
+
+/*
+ * Housekeeping for spawned children.
+ *
+ * First collect, without blocking, every child that has exited or was killed
+ * by a signal, drop it from nsjconf->pids and log the outcome. Then, if a time
+ * limit is configured (nsjconf->tlimit != 0), send SIGCONT followed by SIGKILL
+ * to every tracked child that has been running for at least that long. The
+ * killed children stay on the list here; they are removed once waitpid()
+ * reports them.
+ */
+void subprocReap(struct nsjconf_t *nsjconf)
+{
+       for (;;) {
+               int wstatus;
+               const pid_t reaped = waitpid(-1, &wstatus, WNOHANG);
+               if (reaped <= 0) {
+                       break;
+               }
+               if (WIFEXITED(wstatus)) {
+                       subprocRemove(nsjconf, reaped);
+                       LOG_I("PID: %d exited with status: %d, (PIDs left: %d)", reaped,
+                             WEXITSTATUS(wstatus), subprocCount(nsjconf));
+               }
+               if (WIFSIGNALED(wstatus)) {
+                       subprocRemove(nsjconf, reaped);
+                       LOG_I("PID: %d terminated with signal: %d, (PIDs left: %d)", reaped,
+                             WTERMSIG(wstatus), subprocCount(nsjconf));
+               }
+       }
+
+       /* A limit of 0 means no time limit: nothing to enforce */
+       if (nsjconf->tlimit == 0) {
+               return;
+       }
+
+       const time_t now = time(NULL);
+       struct pids_t *entry;
+       LIST_FOREACH(entry, &nsjconf->pids, pointers) {
+               const time_t elapsed = now - entry->start;
+               if (elapsed < nsjconf->tlimit) {
+                       continue;
+               }
+               const pid_t victim = entry->pid;
+               LOG_I("PID: %d run time >= time limit (%ld >= %ld) (%s). Killing it", victim,
+                     (long)elapsed, (long)nsjconf->tlimit, entry->remote_txt);
+               /* Probably a kernel bug - some processes cannot be killed with KILL if
+                * they're namespaced, and in a stopped state */
+               kill(victim, SIGCONT);
+               PLOG_D("Sent SIGCONT to PID: %d", victim);
+               kill(victim, SIGKILL);
+               PLOG_D("Sent SIGKILL to PID: %d", victim);
+       }
+}
+
+/*
+ * Send SIGKILL to every child recorded in nsjconf->pids. The records
+ * themselves are left on the list.
+ */
+void subprocKillAll(struct nsjconf_t *nsjconf)
+{
+       for (struct pids_t *entry = LIST_FIRST(&nsjconf->pids); entry != NULL;
+            entry = LIST_NEXT(entry, pointers)) {
+               kill(entry->pid, SIGKILL);
+       }
+}
+
+/*
+ * Spawn one jailed child for the connection on fd_in.
+ *
+ * Does nothing when netLimitConns() rejects the connection. Otherwise a new
+ * process is created with a raw clone() using SIGCHLD plus the CLONE_NEW*
+ * flags enabled in nsjconf. The child runs subprocNewProc(), which does not
+ * return; the parent records the child's PID and logs what it will execute.
+ * A failed clone() is logged and nothing is recorded.
+ */
+void subprocRunChild(struct nsjconf_t *nsjconf, int fd_in, int fd_out, int fd_err)
+{
+       if (netLimitConns(nsjconf, fd_in) == false) {
+               return;
+       }
+
+       unsigned int flags = SIGCHLD;
+       if (nsjconf->clone_newnet) {
+               flags |= CLONE_NEWNET;
+       }
+       if (nsjconf->clone_newuser) {
+               flags |= CLONE_NEWUSER;
+       }
+       if (nsjconf->clone_newns) {
+               flags |= CLONE_NEWNS;
+       }
+       if (nsjconf->clone_newpid) {
+               flags |= CLONE_NEWPID;
+       }
+       if (nsjconf->clone_newipc) {
+               flags |= CLONE_NEWIPC;
+       }
+       if (nsjconf->clone_newuts) {
+               flags |= CLONE_NEWUTS;
+       }
+
+       LOG_D("Creating new process with clone flags: %#x", flags);
+
+       pid_t child = syscall(__NR_clone, flags, NULL, NULL, NULL, 0);
+       if (child == -1) {
+               PLOG_E("clone(flags=%#x) failed. You probably need root privileges if your system "
+                      "doesn't support CLONE_NEWUSER. Alternatively, you might want to recompile your "
+                      "kernel with support for namespaces", flags);
+               return;
+       }
+       if (child == 0) {
+               /* In the child: set up the jail and exec; this call does not return */
+               subprocNewProc(nsjconf, fd_in, fd_out, fd_err);
+       }
+
+       /* In the parent: remember the child and report it */
+       subprocAdd(nsjconf, child, fd_in);
+
+       char peer_txt[64];
+       netConnToText(fd_in, true /* remote */ , peer_txt, sizeof(peer_txt), NULL);
+       LOG_I("PID: %d about to execute '%s' for %s", child, nsjconf->argv[0], peer_txt);
+}
diff --git a/subproc.h b/subproc.h
new file mode 100644 (file)
index 0000000..b189e84
--- /dev/null
+++ b/subproc.h
@@ -0,0 +1,32 @@
+/*
+
+   nsjail - subprocess management
+   -----------------------------------------
+
+   Copyright 2014 Google Inc. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+*/
+#ifndef _PROC_H
+#define _PROC_H
+
+#include "common.h"
+
+/*
+ * Clone a new process using the namespace flags enabled in nsjconf, run the
+ * configured command inside it and start tracking its PID. Returns without
+ * spawning anything if the connection limit check or clone() fails.
+ */
+void subprocRunChild(struct nsjconf_t *nsjconf, int fd_in, int fd_out, int fd_err);
+/*
+ * Collect terminated children without blocking, and kill tracked children that
+ * have run for at least nsjconf->tlimit seconds (0 disables the limit).
+ */
+void subprocReap(struct nsjconf_t *nsjconf);
+/* Number of children currently tracked in nsjconf->pids. */
+int subprocCount(struct nsjconf_t *nsjconf);
+/* Log every tracked child: PID, remote host and run time. */
+void subprocDisplay(struct nsjconf_t *nsjconf);
+/* Send SIGKILL to every tracked child. */
+void subprocKillAll(struct nsjconf_t *nsjconf);
+
+#endif                         /* _PROC_H */