BIN = nsjail
LIBS = kafel/libkafel.a
-SRCS_CXX = caps.cc cgroup.cc cmdline.cc config.cc contain.cc cpu.cc logs.cc mnt.cc net.cc nsjail.cc pid.cc sandbox.cc subproc.cc uts.cc user.cc util.cc
+SRCS_CXX = caps.cc cgroup.cc cgroup2.cc cmdline.cc config.cc contain.cc cpu.cc logs.cc mnt.cc net.cc nsjail.cc pid.cc sandbox.cc subproc.cc uts.cc user.cc util.cc
SRCS_PROTO = config.proto
SRCS_PB_CXX = $(SRCS_PROTO:.proto=.pb.cc)
SRCS_PB_H = $(SRCS_PROTO:.proto=.pb.h)
caps.o: caps.h nsjail.h logs.h macros.h util.h
cgroup.o: cgroup.h nsjail.h logs.h util.h
+cgroup2.o: cgroup2.h nsjail.h logs.h util.h
cmdline.o: cmdline.h nsjail.h caps.h config.h logs.h macros.h mnt.h user.h
cmdline.o: util.h
config.o: config.h nsjail.h caps.h cmdline.h config.pb.h logs.h macros.h
config.o: mnt.h user.h util.h
-contain.o: contain.h nsjail.h caps.h cgroup.h cpu.h logs.h macros.h mnt.h
+contain.o: contain.h nsjail.h caps.h cgroup.h cgroup2.h cpu.h logs.h macros.h mnt.h
contain.o: net.h pid.h user.h util.h uts.h
cpu.o: cpu.h nsjail.h logs.h util.h
logs.o: logs.h macros.h util.h nsjail.h
nsjail.o: nsjail.h cmdline.h logs.h macros.h net.h sandbox.h subproc.h util.h
pid.o: pid.h nsjail.h logs.h subproc.h
sandbox.o: sandbox.h nsjail.h kafel/include/kafel.h logs.h util.h
-subproc.o: subproc.h nsjail.h cgroup.h contain.h logs.h macros.h net.h
+subproc.o: subproc.h nsjail.h cgroup.h cgroup2.h contain.h logs.h macros.h net.h
subproc.o: sandbox.h user.h util.h
uts.o: uts.h nsjail.h logs.h
user.o: user.h nsjail.h logs.h macros.h subproc.h util.h
--- /dev/null
+/*
+
+ nsjail - cgroup2 namespacing
+ -----------------------------------------
+
+ Copyright 2014 Google Inc. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+#include "cgroup2.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <fstream>
+#include <iostream>
+#include <sstream>
+
+#include "logs.h"
+#include "util.h"
+
+namespace cgroup2 {
+
+/* Builds the per-jail cgroup v2 directory path: <cgroupv2_mount>/NSJAIL.<pid> */
+static std::string getCgroupPath(nsjconf_t *nsjconf, pid_t pid) {
+    std::string jail_dir = nsjconf->cgroupv2_mount;
+    jail_dir += "/NSJAIL.";
+    jail_dir += std::to_string(pid);
+    return jail_dir;
+}
+
+/*
+ * Creates the cgroup directory 'cgroup_path' with mode 0700.
+ * A directory that already exists (EEXIST) counts as success; any other
+ * mkdir() error is logged and reported as failure. 'pid' is used for logging only.
+ */
+static bool createCgroup(const std::string &cgroup_path, pid_t pid) {
+    LOG_D("Create '%s' for pid=%d", cgroup_path.c_str(), (int)pid);
+
+    const bool made_dir = (mkdir(cgroup_path.c_str(), 0700) != -1);
+    if (made_dir || errno == EEXIST) {
+        return true;
+    }
+
+    PLOG_W("mkdir('%s', 0700) failed", cgroup_path.c_str());
+    return false;
+}
+
+/*
+ * Writes 'value' into the control file '<cgroup_path>/<resource>'.
+ * Returns true on success; on failure logs a warning and returns false.
+ */
+static bool writeToCgroup(
+    const std::string &cgroup_path, const std::string &resource, const std::string &value) {
+    LOG_I("Setting '%s' to '%s'", resource.c_str(), value.c_str());
+
+    const std::string control_file = cgroup_path + "/" + resource;
+    if (util::writeBufToFile(control_file.c_str(), value.c_str(), value.length(), O_WRONLY)) {
+        return true;
+    }
+
+    LOG_W("Could not update %s", resource.c_str());
+    return false;
+}
+
+/*
+ * Moves process 'pid' into the cgroup by writing its decimal pid to
+ * '<cgroup_path>/cgroup.procs'. Returns false (after logging) if the write fails.
+ */
+static bool addPidToProcList(const std::string &cgroup_path, pid_t pid) {
+    const std::string pid_text = std::to_string(pid);
+    const std::string procs_file = cgroup_path + "/cgroup.procs";
+
+    LOG_D("Adding pid='%s' to cgroup.procs", pid_text.c_str());
+    if (util::writeBufToFile(procs_file.c_str(), pid_text.c_str(), pid_text.length(), O_WRONLY)) {
+        return true;
+    }
+
+    LOG_W("Could not update cgroup.procs");
+    return false;
+}
+
+/* Removes the cgroup directory; a failing rmdir() is only logged as a warning. */
+static void removeCgroup(const std::string &cgroup_path) {
+    LOG_D("Remove '%s'", cgroup_path.c_str());
+
+    if (rmdir(cgroup_path.c_str()) != -1) {
+        return;
+    }
+    PLOG_W("rmdir('%s') failed", cgroup_path.c_str());
+}
+
+/*
+ * Applies the memory limit for 'pid' when nsjconf->cgroup_mem_max is non-zero:
+ * creates the jail cgroup, adds 'pid' to it and writes the limit to memory.max.
+ * Returns true when no limit is configured or every step succeeded.
+ */
+static bool initNsFromParentMem(nsjconf_t *nsjconf, pid_t pid) {
+    const auto mem_limit = nsjconf->cgroup_mem_max;
+    if (mem_limit == (size_t)0) {
+        return true;
+    }
+
+    const std::string jail_dir = getCgroupPath(nsjconf, pid);
+    RETURN_ON_FAILURE(createCgroup(jail_dir, pid));
+    RETURN_ON_FAILURE(addPidToProcList(jail_dir, pid));
+
+    const std::string limit_text = std::to_string(mem_limit);
+    return writeToCgroup(jail_dir, "memory.max", limit_text);
+}
+
+/*
+ * Applies the process-count limit for 'pid' when nsjconf->cgroup_pids_max is
+ * non-zero: creates the jail cgroup, adds 'pid' to it and writes the limit to
+ * pids.max. Returns true when no limit is configured or every step succeeded.
+ */
+static bool initNsFromParentPids(nsjconf_t *nsjconf, pid_t pid) {
+    const auto pids_limit = nsjconf->cgroup_pids_max;
+    if (pids_limit == 0U) {
+        return true;
+    }
+
+    const std::string jail_dir = getCgroupPath(nsjconf, pid);
+    RETURN_ON_FAILURE(createCgroup(jail_dir, pid));
+    RETURN_ON_FAILURE(addPidToProcList(jail_dir, pid));
+
+    const std::string limit_text = std::to_string(pids_limit);
+    return writeToCgroup(jail_dir, "pids.max", limit_text);
+}
+
+/*
+ * Applies the CPU bandwidth limit for 'pid' when nsjconf->cgroup_cpu_ms_per_sec
+ * is non-zero: creates the jail cgroup, adds 'pid' to it and writes
+ * "<quota> <period>" to cpu.max.
+ * Returns true when no limit is configured or every step succeeded.
+ */
+static bool initNsFromParentCpu(nsjconf_t *nsjconf, pid_t pid) {
+    if (nsjconf->cgroup_cpu_ms_per_sec == 0U) {
+        return true;
+    }
+
+    std::string cgroup_path = getCgroupPath(nsjconf, pid);
+    RETURN_ON_FAILURE(createCgroup(cgroup_path, pid));
+    RETURN_ON_FAILURE(addPidToProcList(cgroup_path, pid));
+
+    // The maximum bandwidth limit in the format: `$MAX $PERIOD`.
+    // This indicates that the group may consume up to $MAX in each $PERIOD
+    // duration.
+    //
+    // The configured value is in milliseconds per second and is scaled by 1000
+    // to match the fixed 1000000 period. The multiplication is done in 64 bits:
+    // with 32-bit unsigned arithmetic any value above 4294967 would wrap and
+    // silently yield a much smaller quota than requested.
+    const unsigned long long quota =
+        static_cast<unsigned long long>(nsjconf->cgroup_cpu_ms_per_sec) * 1000ULL;
+    const std::string cpu_max = std::to_string(quota) + " 1000000";
+    return writeToCgroup(cgroup_path, "cpu.max", cpu_max);
+}
+
+/*
+ * Applies the configured memory, pids and CPU limits (in that order) to 'pid'.
+ * Stops at the first step that fails and returns false; returns true otherwise.
+ */
+bool initNsFromParent(nsjconf_t *nsjconf, pid_t pid) {
+    if (!initNsFromParentMem(nsjconf, pid)) {
+        return false;
+    }
+    if (!initNsFromParentPids(nsjconf, pid)) {
+        return false;
+    }
+    return initNsFromParentCpu(nsjconf, pid);
+}
+
+/*
+ * Removes the jail cgroup of 'pid'. Does nothing when no memory, pids or CPU
+ * limit is configured.
+ */
+void finishFromParent(nsjconf_t *nsjconf, pid_t pid) {
+    const bool no_limits_set = nsjconf->cgroup_mem_max == (size_t)0 &&
+        nsjconf->cgroup_pids_max == 0U && nsjconf->cgroup_cpu_ms_per_sec == 0U;
+    if (no_limits_set) {
+        return;
+    }
+
+    removeCgroup(getCgroupPath(nsjconf, pid));
+}
+
+} // namespace cgroup2
--- /dev/null
+/*
+
+ nsjail - cgroup2 namespacing
+ -----------------------------------------
+
+ Copyright 2014 Google Inc. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
+*/
+
+#ifndef NS_CGROUP2_H
+#define NS_CGROUP2_H
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "nsjail.h"
+
+/* cgroup v2 resource limiting, selected instead of 'cgroup' when use_cgroupv2 is set. */
+namespace cgroup2 {
+
+/*
+ * For every non-zero limit in nsjconf (cgroup_mem_max, cgroup_pids_max,
+ * cgroup_cpu_ms_per_sec): creates <cgroupv2_mount>/NSJAIL.<pid>, adds 'pid' to
+ * its cgroup.procs and writes the limit to memory.max / pids.max / cpu.max.
+ * Returns false as soon as one step fails, true otherwise.
+ */
+bool initNsFromParent(nsjconf_t* nsjconf, pid_t pid);
+/*
+ * NOTE(review): no definition of initNs() is visible in cgroup2.cc - confirm
+ * one exists elsewhere, otherwise calling it will fail at link time.
+ */
+bool initNs(void);
+/*
+ * Removes <cgroupv2_mount>/NSJAIL.<pid> if any of the limits above is configured.
+ * A failing removal is only logged.
+ */
+void finishFromParent(nsjconf_t* nsjconf, pid_t pid);
+
+} // namespace cgroup2
+
+#endif /* NS_CGROUP2_H */
{ { "cgroup_cpu_ms_per_sec", required_argument, NULL, 0x0831 }, "Number of milliseconds of CPU time per second that the process group can use (default: '0' - no limit)" },
{ { "cgroup_cpu_mount", required_argument, NULL, 0x0832 }, "Location of cpu cgroup FS (default: '/sys/fs/cgroup/net_cls')" },
{ { "cgroup_cpu_parent", required_argument, NULL, 0x0833 }, "Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')" },
+ { { "cgroupv2_mount", required_argument, NULL, 0x0834}, "Location of cgroupv2 directory (default: '/sys/fs/cgroup')"},
+ { { "use_cgroupv2", no_argument, NULL, 0x0835}, "Use cgroup v2"},
{ { "iface_no_lo", no_argument, NULL, 0x700 }, "Don't bring the 'lo' interface up" },
{ { "iface_own", required_argument, NULL, 0x704 }, "Move this existing network interface into the new NET namespace. Can be specified multiple times" },
{ { "macvlan_iface", required_argument, NULL, 'I' }, "Interface which will be cloned (MACVLAN) and put inside the subprocess' namespace as 'vs'" },
nsjconf->cgroup_cpu_mount = "/sys/fs/cgroup/cpu";
nsjconf->cgroup_cpu_parent = "NSJAIL";
nsjconf->cgroup_cpu_ms_per_sec = 0U;
+ nsjconf->cgroupv2_mount = "/sys/fs/cgroup";
+ nsjconf->use_cgroupv2 = false;
nsjconf->iface_lo = true;
nsjconf->iface_vs_ip = "0.0.0.0";
nsjconf->iface_vs_nm = "255.255.255.0";
case 0x833:
nsjconf->cgroup_cpu_parent = optarg;
break;
+ case 0x834:
+ nsjconf->cgroupv2_mount = optarg;
+ break;
+ case 0x835:
+ nsjconf->use_cgroupv2 = true;
+ break;
case 'P':
nsjconf->kafel_file_path = optarg;
break;
nsjconf->cgroup_cpu_ms_per_sec = njc.cgroup_cpu_ms_per_sec();
nsjconf->cgroup_cpu_mount = njc.cgroup_cpu_mount();
nsjconf->cgroup_cpu_parent = njc.cgroup_cpu_parent();
+ nsjconf->cgroupv2_mount = njc.cgroupv2_mount();
+ nsjconf->use_cgroupv2 = njc.use_cgroupv2();
nsjconf->iface_lo = !(njc.iface_no_lo());
for (ssize_t i = 0; i < njc.iface_own().size(); i++) {
enum Mode {
LISTEN = 0; /* Listening on a TCP port */
- ONCE = 1; /* Running the command once only */
- RERUN = 2; /* Re-executing the command (forever) */
+ ONCE = 1; /* Running the command once only */
+ RERUN = 2; /* Re-executing the command (forever) */
EXECVE = 3; /* Executing command w/o the supervisor */
}
/* Should be self explanatory */
enum LogLevel {
- DEBUG = 0; /* Equivalent to the '-v' cmd-line option */
- INFO = 1; /* Default level */
+ DEBUG = 0; /* Equivalent to the '-v' cmd-line option */
+ INFO = 1; /* Default level */
WARNING = 2; /* Equivalent to the '-q' cmd-line option */
ERROR = 3;
FATAL = 4;
/* Binary path (with arguments) to be executed. If not specified here, it
can be specified with cmd-line as "-- /path/to/command arg1 arg2" */
optional Exe exec_bin = 81;
+
+ /* Mount point for cgroup v2 in your system */
+ optional string cgroupv2_mount = 82 [default = "/sys/fs/cgroup"];
+
+ /* Use cgroup v2 */
+ optional bool use_cgroupv2 = 83 [default = false];
}
\fB\-\-cgroup_cpu_parent\fR VALUE
Which pre-existing cpu cgroup to use as a parent (default: 'NSJAIL')
.TP
+\fB\-\-cgroupv2_mount\fR VALUE
+Location of cgroup v2 directory (default: '/sys/fs/cgroup')
+.TP
+\fB\-\-use_cgroupv2\fR
+Use cgroup v2
+.TP
\fB\-\-iface_no_lo\fR
Don't bring the 'lo' interface up
.TP
std::string cgroup_cpu_mount;
std::string cgroup_cpu_parent;
unsigned int cgroup_cpu_ms_per_sec;
+ std::string cgroupv2_mount;
+ bool use_cgroupv2;
std::string kafel_file_path;
std::string kafel_string;
struct sock_fprog seccomp_fprog;
#include <vector>
#include "cgroup.h"
+#include "cgroup2.h"
#include "contain.h"
#include "logs.h"
#include "macros.h"
LOG_E("Couldn't initialize net user namespace");
return;
}
- if (!cgroup::initNsFromParent(nsjconf, getpid())) {
+ if (nsjconf->use_cgroupv2) {
+ if (!cgroup2::initNsFromParent(nsjconf, getpid())) {
+ LOG_E("Couldn't initialize net user namespace");
+ return;
+ }
+ } else if (!cgroup::initNsFromParent(nsjconf, getpid())) {
LOG_E("Couldn't initialize net user namespace");
return;
}
int status;
if (wait4(pid, &status, should_wait ? 0 : WNOHANG, NULL) == pid) {
- cgroup::finishFromParent(nsjconf, pid);
+ if (nsjconf->use_cgroupv2) {
+ cgroup2::finishFromParent(nsjconf, pid);
+ } else {
+ cgroup::finishFromParent(nsjconf, pid);
+ }
std::string remote_txt = "[UNKNOWN]";
const pids_t* elem = getPidElem(nsjconf, pid);
LOG_E("Couldn't initialize net namespace for pid=%d", pid);
return false;
}
- if (!cgroup::initNsFromParent(nsjconf, pid)) {
+
+ if (nsjconf->use_cgroupv2) {
+ if (!cgroup2::initNsFromParent(nsjconf, pid)) {
+ LOG_E("Couldn't initialize cgroup 2 user namespace for pid=%d", pid);
+ exit(0xff);
+ }
+ } else if (!cgroup::initNsFromParent(nsjconf, pid)) {
LOG_E("Couldn't initialize cgroup user namespace for pid=%d", pid);
exit(0xff);
}
+
if (!user::initNsFromParent(nsjconf, pid)) {
LOG_E("Couldn't initialize user namespace for pid=%d", pid);
return false;