Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 29 Jun 2021 03:39:26 +0000 (20:39 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 29 Jun 2021 03:39:26 +0000 (20:39 -0700)
Pull user namespace rlimit handling update from Eric Biederman:
 "This is the work mainly by Alexey Gladkov to limit rlimits to the
  rlimits of the user that created a user namespace, and to allow users
  to have stricter limits on the resources created within a user
  namespace."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
  cred: add missing return error code when set_cred_ucounts() failed
  ucounts: Silence warning in dec_rlimit_ucounts
  ucounts: Set ucount_max to the largest positive value the type can hold
  kselftests: Add test to check for rlimit changes in different user namespaces
  Reimplement RLIMIT_MEMLOCK on top of ucounts
  Reimplement RLIMIT_SIGPENDING on top of ucounts
  Reimplement RLIMIT_MSGQUEUE on top of ucounts
  Reimplement RLIMIT_NPROC on top of ucounts
  Use atomic_t for ucounts reference counting
  Add a reference to ucounts for each cred
  Increase size of ucounts to atomic_long_t

29 files changed:
fs/exec.c
fs/hugetlbfs/inode.c
fs/proc/array.c
include/linux/cred.h
include/linux/hugetlb.h
include/linux/mm.h
include/linux/sched/user.h
include/linux/shmem_fs.h
include/linux/signal_types.h
include/linux/user_namespace.h
ipc/mqueue.c
ipc/shm.c
kernel/cred.c
kernel/exit.c
kernel/fork.c
kernel/signal.c
kernel/sys.c
kernel/ucount.c
kernel/user.c
kernel/user_namespace.c
mm/memfd.c
mm/mlock.c
mm/mmap.c
mm/shmem.c
tools/testing/selftests/Makefile
tools/testing/selftests/rlimits/.gitignore [new file with mode: 0644]
tools/testing/selftests/rlimits/Makefile [new file with mode: 0644]
tools/testing/selftests/rlimits/config [new file with mode: 0644]
tools/testing/selftests/rlimits/rlimits-per-userns.c [new file with mode: 0644]

index 18594f1..f2bcdbe 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1360,6 +1360,10 @@ int begin_new_exec(struct linux_binprm * bprm)
        WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
        flush_signal_handlers(me, 0);
 
+       retval = set_cred_ucounts(bprm->cred);
+       if (retval < 0)
+               goto out_unlock;
+
        /*
         * install the new credentials for this executable
         */
@@ -1874,7 +1878,7 @@ static int do_execveat_common(int fd, struct filename *filename,
         * whether NPROC limit is still exceeded.
         */
        if ((current->flags & PF_NPROC_EXCEEDED) &&
-           atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) {
+           is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                retval = -EAGAIN;
                goto out_ret;
        }
index 30dee68..926eeb9 100644 (file)
@@ -1446,7 +1446,7 @@ static int get_hstate_idx(int page_size_log)
  * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
  */
 struct file *hugetlb_file_setup(const char *name, size_t size,
-                               vm_flags_t acctflag, struct user_struct **user,
+                               vm_flags_t acctflag, struct ucounts **ucounts,
                                int creat_flags, int page_size_log)
 {
        struct inode *inode;
@@ -1458,20 +1458,20 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
        if (hstate_idx < 0)
                return ERR_PTR(-ENODEV);
 
-       *user = NULL;
+       *ucounts = NULL;
        mnt = hugetlbfs_vfsmount[hstate_idx];
        if (!mnt)
                return ERR_PTR(-ENOENT);
 
        if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
-               *user = current_user();
-               if (user_shm_lock(size, *user)) {
+               *ucounts = current_ucounts();
+               if (user_shm_lock(size, *ucounts)) {
                        task_lock(current);
                        pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
                                current->comm, current->pid);
                        task_unlock(current);
                } else {
-                       *user = NULL;
+                       *ucounts = NULL;
                        return ERR_PTR(-EPERM);
                }
        }
@@ -1498,9 +1498,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
 
        iput(inode);
 out:
-       if (*user) {
-               user_shm_unlock(size, *user);
-               *user = NULL;
+       if (*ucounts) {
+               user_shm_unlock(size, *ucounts);
+               *ucounts = NULL;
        }
        return file;
 }
index 7ec5917..ee0ce8c 100644 (file)
@@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
                collect_sigign_sigcatch(p, &ignored, &caught);
                num_threads = get_nr_threads(p);
                rcu_read_lock();  /* FIXME: is this correct? */
-               qsize = atomic_read(&__task_cred(p)->user->sigpending);
+               qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
                rcu_read_unlock();
                qlim = task_rlimit(p, RLIMIT_SIGPENDING);
                unlock_task_sighand(p, &flags);
index 1497132..fcbc688 100644 (file)
@@ -143,6 +143,7 @@ struct cred {
 #endif
        struct user_struct *user;       /* real user ID subscription */
        struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
+       struct ucounts *ucounts;
        struct group_info *group_info;  /* supplementary groups for euid/fsgid */
        /* RCU deletion */
        union {
@@ -169,6 +170,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
 extern int set_create_files_as(struct cred *, struct inode *);
 extern int cred_fscmp(const struct cred *, const struct cred *);
 extern void __init cred_init(void);
+extern int set_cred_ucounts(struct cred *);
 
 /*
  * check for validity of credentials
@@ -369,6 +371,7 @@ static inline void put_cred(const struct cred *_cred)
 
 #define task_uid(task)         (task_cred_xxx((task), uid))
 #define task_euid(task)                (task_cred_xxx((task), euid))
+#define task_ucounts(task)     (task_cred_xxx((task), ucounts))
 
 #define current_cred_xxx(xxx)                  \
 ({                                             \
@@ -385,6 +388,7 @@ static inline void put_cred(const struct cred *_cred)
 #define current_fsgid()        (current_cred_xxx(fsgid))
 #define current_cap()          (current_cred_xxx(cap_effective))
 #define current_user()         (current_cred_xxx(user))
+#define current_ucounts()      (current_cred_xxx(ucounts))
 
 extern struct user_namespace init_user_ns;
 #ifdef CONFIG_USER_NS
index 3c01176..8ba79dc 100644 (file)
@@ -451,7 +451,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
 extern const struct file_operations hugetlbfs_file_operations;
 extern const struct vm_operations_struct hugetlb_vm_ops;
 struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
-                               struct user_struct **user, int creat_flags,
+                               struct ucounts **ucounts, int creat_flags,
                                int page_size_log);
 
 static inline bool is_file_hugepages(struct file *file)
@@ -471,7 +471,7 @@ static inline struct hstate *hstate_inode(struct inode *i)
 #define is_file_hugepages(file)                        false
 static inline struct file *
 hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
-               struct user_struct **user, int creat_flags,
+               struct ucounts **ucounts, int creat_flags,
                int page_size_log)
 {
        return ERR_PTR(-ENOSYS);
index 8ae3162..01ecf9e 100644 (file)
@@ -1709,8 +1709,8 @@ extern bool can_do_mlock(void);
 #else
 static inline bool can_do_mlock(void) { return false; }
 #endif
-extern int user_shm_lock(size_t, struct user_struct *);
-extern void user_shm_unlock(size_t, struct user_struct *);
+extern int user_shm_lock(size_t, struct ucounts *);
+extern void user_shm_unlock(size_t, struct ucounts *);
 
 /*
  * Parameter block passed down to zap_pte_range in exceptional cases.
index 3632c5d..2462f7d 100644 (file)
  */
 struct user_struct {
        refcount_t __count;     /* reference count */
-       atomic_t processes;     /* How many processes does this user have? */
-       atomic_t sigpending;    /* How many pending signals does this user have? */
 #ifdef CONFIG_EPOLL
        atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
 #endif
-#ifdef CONFIG_POSIX_MQUEUE
-       /* protected by mq_lock */
-       unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
-#endif
-       unsigned long locked_shm; /* How many pages of mlocked shm ? */
        unsigned long unix_inflight;    /* How many files in flight in unix sockets */
        atomic_long_t pipe_bufs;  /* how many pages are allocated in pipe buffers */
 
index d82b6f3..aa77dcd 100644 (file)
@@ -65,7 +65,7 @@ extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
 extern int shmem_zero_setup(struct vm_area_struct *);
 extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
                unsigned long len, unsigned long pgoff, unsigned long flags);
-extern int shmem_lock(struct file *file, int lock, struct user_struct *user);
+extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
 #ifdef CONFIG_SHMEM
 extern const struct address_space_operations shmem_aops;
 static inline bool shmem_mapping(struct address_space *mapping)
index 68e06c7..34cb28b 100644 (file)
@@ -13,6 +13,8 @@ typedef struct kernel_siginfo {
        __SIGINFO;
 } kernel_siginfo_t;
 
+struct ucounts;
+
 /*
  * Real Time signals may be queued.
  */
@@ -21,7 +23,7 @@ struct sigqueue {
        struct list_head list;
        int flags;
        kernel_siginfo_t info;
-       struct user_struct *user;
+       struct ucounts *ucounts;
 };
 
 /* flags values. */
index 1d08dbb..eb70cab 100644 (file)
@@ -54,9 +54,15 @@ enum ucount_type {
        UCOUNT_FANOTIFY_GROUPS,
        UCOUNT_FANOTIFY_MARKS,
 #endif
+       UCOUNT_RLIMIT_NPROC,
+       UCOUNT_RLIMIT_MSGQUEUE,
+       UCOUNT_RLIMIT_SIGPENDING,
+       UCOUNT_RLIMIT_MEMLOCK,
        UCOUNT_COUNTS,
 };
 
+#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC
+
 struct user_namespace {
        struct uid_gid_map      uid_map;
        struct uid_gid_map      gid_map;
@@ -92,23 +98,42 @@ struct user_namespace {
        struct ctl_table_header *sysctls;
 #endif
        struct ucounts          *ucounts;
-       int ucount_max[UCOUNT_COUNTS];
+       long ucount_max[UCOUNT_COUNTS];
 } __randomize_layout;
 
 struct ucounts {
        struct hlist_node node;
        struct user_namespace *ns;
        kuid_t uid;
-       int count;
-       atomic_t ucount[UCOUNT_COUNTS];
+       atomic_t count;
+       atomic_long_t ucount[UCOUNT_COUNTS];
 };
 
 extern struct user_namespace init_user_ns;
+extern struct ucounts init_ucounts;
 
 bool setup_userns_sysctls(struct user_namespace *ns);
 void retire_userns_sysctls(struct user_namespace *ns);
 struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
 void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
+struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
+struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
+void put_ucounts(struct ucounts *ucounts);
+
+static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type)
+{
+       return atomic_long_read(&ucounts->ucount[type]);
+}
+
+long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
+
+static inline void set_rlimit_ucount_max(struct user_namespace *ns,
+               enum ucount_type type, unsigned long max)
+{
+       ns->ucount_max[type] = max <= LONG_MAX ? max : LONG_MAX;
+}
 
 #ifdef CONFIG_USER_NS
 
index 4e4e611..5becca9 100644 (file)
@@ -144,7 +144,7 @@ struct mqueue_inode_info {
        struct pid *notify_owner;
        u32 notify_self_exec_id;
        struct user_namespace *notify_user_ns;
-       struct user_struct *user;       /* user who created, for accounting */
+       struct ucounts *ucounts;        /* user who created, for accounting */
        struct sock *notify_sock;
        struct sk_buff *notify_cookie;
 
@@ -292,7 +292,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
                struct ipc_namespace *ipc_ns, umode_t mode,
                struct mq_attr *attr)
 {
-       struct user_struct *u = current_user();
        struct inode *inode;
        int ret = -ENOMEM;
 
@@ -321,7 +320,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
                info->notify_owner = NULL;
                info->notify_user_ns = NULL;
                info->qsize = 0;
-               info->user = NULL;      /* set when all is ok */
+               info->ucounts = NULL;   /* set when all is ok */
                info->msg_tree = RB_ROOT;
                info->msg_tree_rightmost = NULL;
                info->node_cache = NULL;
@@ -371,19 +370,23 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
                if (mq_bytes + mq_treesize < mq_bytes)
                        goto out_inode;
                mq_bytes += mq_treesize;
-               spin_lock(&mq_lock);
-               if (u->mq_bytes + mq_bytes < u->mq_bytes ||
-                   u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) {
+               info->ucounts = get_ucounts(current_ucounts());
+               if (info->ucounts) {
+                       long msgqueue;
+
+                       spin_lock(&mq_lock);
+                       msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
+                       if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {
+                               dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
+                               spin_unlock(&mq_lock);
+                               put_ucounts(info->ucounts);
+                               info->ucounts = NULL;
+                               /* mqueue_evict_inode() releases info->messages */
+                               ret = -EMFILE;
+                               goto out_inode;
+                       }
                        spin_unlock(&mq_lock);
-                       /* mqueue_evict_inode() releases info->messages */
-                       ret = -EMFILE;
-                       goto out_inode;
                }
-               u->mq_bytes += mq_bytes;
-               spin_unlock(&mq_lock);
-
-               /* all is ok */
-               info->user = get_uid(u);
        } else if (S_ISDIR(mode)) {
                inc_nlink(inode);
                /* Some things misbehave if size == 0 on a directory */
@@ -497,7 +500,6 @@ static void mqueue_free_inode(struct inode *inode)
 static void mqueue_evict_inode(struct inode *inode)
 {
        struct mqueue_inode_info *info;
-       struct user_struct *user;
        struct ipc_namespace *ipc_ns;
        struct msg_msg *msg, *nmsg;
        LIST_HEAD(tmp_msg);
@@ -520,8 +522,7 @@ static void mqueue_evict_inode(struct inode *inode)
                free_msg(msg);
        }
 
-       user = info->user;
-       if (user) {
+       if (info->ucounts) {
                unsigned long mq_bytes, mq_treesize;
 
                /* Total amount of bytes accounted for the mqueue */
@@ -533,7 +534,7 @@ static void mqueue_evict_inode(struct inode *inode)
                                          info->attr.mq_msgsize);
 
                spin_lock(&mq_lock);
-               user->mq_bytes -= mq_bytes;
+               dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
                /*
                 * get_ns_from_inode() ensures that the
                 * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
@@ -543,7 +544,8 @@ static void mqueue_evict_inode(struct inode *inode)
                if (ipc_ns)
                        ipc_ns->mq_queues_count--;
                spin_unlock(&mq_lock);
-               free_uid(user);
+               put_ucounts(info->ucounts);
+               info->ucounts = NULL;
        }
        if (ipc_ns)
                put_ipc_ns(ipc_ns);
index febd88d..003234f 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -60,7 +60,7 @@ struct shmid_kernel /* private to the kernel */
        time64_t                shm_ctim;
        struct pid              *shm_cprid;
        struct pid              *shm_lprid;
-       struct user_struct      *mlock_user;
+       struct ucounts          *mlock_ucounts;
 
        /* The task created the shm object.  NULL if the task is dead. */
        struct task_struct      *shm_creator;
@@ -286,10 +286,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
        shm_rmid(ns, shp);
        shm_unlock(shp);
        if (!is_file_hugepages(shm_file))
-               shmem_lock(shm_file, 0, shp->mlock_user);
-       else if (shp->mlock_user)
+               shmem_lock(shm_file, 0, shp->mlock_ucounts);
+       else if (shp->mlock_ucounts)
                user_shm_unlock(i_size_read(file_inode(shm_file)),
-                               shp->mlock_user);
+                               shp->mlock_ucounts);
        fput(shm_file);
        ipc_update_pid(&shp->shm_cprid, NULL);
        ipc_update_pid(&shp->shm_lprid, NULL);
@@ -625,7 +625,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 
        shp->shm_perm.key = key;
        shp->shm_perm.mode = (shmflg & S_IRWXUGO);
-       shp->mlock_user = NULL;
+       shp->mlock_ucounts = NULL;
 
        shp->shm_perm.security = NULL;
        error = security_shm_alloc(&shp->shm_perm);
@@ -650,7 +650,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
                if (shmflg & SHM_NORESERVE)
                        acctflag = VM_NORESERVE;
                file = hugetlb_file_setup(name, hugesize, acctflag,
-                                 &shp->mlock_user, HUGETLB_SHMFS_INODE,
+                                 &shp->mlock_ucounts, HUGETLB_SHMFS_INODE,
                                (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
        } else {
                /*
@@ -698,8 +698,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
 no_id:
        ipc_update_pid(&shp->shm_cprid, NULL);
        ipc_update_pid(&shp->shm_lprid, NULL);
-       if (is_file_hugepages(file) && shp->mlock_user)
-               user_shm_unlock(size, shp->mlock_user);
+       if (is_file_hugepages(file) && shp->mlock_ucounts)
+               user_shm_unlock(size, shp->mlock_ucounts);
        fput(file);
        ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
        return error;
@@ -1105,12 +1105,12 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd)
                goto out_unlock0;
 
        if (cmd == SHM_LOCK) {
-               struct user_struct *user = current_user();
+               struct ucounts *ucounts = current_ucounts();
 
-               err = shmem_lock(shm_file, 1, user);
+               err = shmem_lock(shm_file, 1, ucounts);
                if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
                        shp->shm_perm.mode |= SHM_LOCKED;
-                       shp->mlock_user = user;
+                       shp->mlock_ucounts = ucounts;
                }
                goto out_unlock0;
        }
@@ -1118,9 +1118,9 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd)
        /* SHM_UNLOCK */
        if (!(shp->shm_perm.mode & SHM_LOCKED))
                goto out_unlock0;
-       shmem_lock(shm_file, 0, shp->mlock_user);
+       shmem_lock(shm_file, 0, shp->mlock_ucounts);
        shp->shm_perm.mode &= ~SHM_LOCKED;
-       shp->mlock_user = NULL;
+       shp->mlock_ucounts = NULL;
        get_file(shm_file);
        ipc_unlock_object(&shp->shm_perm);
        rcu_read_unlock();
index e1d274c..e6fd2b3 100644 (file)
@@ -60,6 +60,7 @@ struct cred init_cred = {
        .user                   = INIT_USER,
        .user_ns                = &init_user_ns,
        .group_info             = &init_groups,
+       .ucounts                = &init_ucounts,
 };
 
 static inline void set_cred_subscribers(struct cred *cred, int n)
@@ -119,6 +120,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
        if (cred->group_info)
                put_group_info(cred->group_info);
        free_uid(cred->user);
+       if (cred->ucounts)
+               put_ucounts(cred->ucounts);
        put_user_ns(cred->user_ns);
        kmem_cache_free(cred_jar, cred);
 }
@@ -222,6 +225,7 @@ struct cred *cred_alloc_blank(void)
 #ifdef CONFIG_DEBUG_CREDENTIALS
        new->magic = CRED_MAGIC;
 #endif
+       new->ucounts = get_ucounts(&init_ucounts);
 
        if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
                goto error;
@@ -284,6 +288,11 @@ struct cred *prepare_creds(void)
 
        if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
                goto error;
+
+       new->ucounts = get_ucounts(new->ucounts);
+       if (!new->ucounts)
+               goto error;
+
        validate_creds(new);
        return new;
 
@@ -351,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
                kdebug("share_creds(%p{%d,%d})",
                       p->cred, atomic_read(&p->cred->usage),
                       read_cred_subscribers(p->cred));
-               atomic_inc(&p->cred->user->processes);
+               inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
                return 0;
        }
 
@@ -363,6 +372,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
                ret = create_user_ns(new);
                if (ret < 0)
                        goto error_put;
+               ret = set_cred_ucounts(new);
+               if (ret < 0)
+                       goto error_put;
        }
 
 #ifdef CONFIG_KEYS
@@ -384,8 +396,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
        }
 #endif
 
-       atomic_inc(&new->user->processes);
        p->cred = p->real_cred = get_cred(new);
+       inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        alter_cred_subscribers(new, 2);
        validate_creds(new);
        return 0;
@@ -485,12 +497,12 @@ int commit_creds(struct cred *new)
         * in set_user().
         */
        alter_cred_subscribers(new, 2);
-       if (new->user != old->user)
-               atomic_inc(&new->user->processes);
+       if (new->user != old->user || new->user_ns != old->user_ns)
+               inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
        rcu_assign_pointer(task->real_cred, new);
        rcu_assign_pointer(task->cred, new);
        if (new->user != old->user)
-               atomic_dec(&old->user->processes);
+               dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
        alter_cred_subscribers(old, -2);
 
        /* send notifications */
@@ -653,6 +665,31 @@ int cred_fscmp(const struct cred *a, const struct cred *b)
 }
 EXPORT_SYMBOL(cred_fscmp);
 
+int set_cred_ucounts(struct cred *new)
+{
+       struct task_struct *task = current;
+       const struct cred *old = task->real_cred;
+       struct ucounts *old_ucounts = new->ucounts;
+
+       if (new->user == old->user && new->user_ns == old->user_ns)
+               return 0;
+
+       /*
+        * This optimization is needed because alloc_ucounts() uses locks
+        * for table lookups.
+        */
+       if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
+               return 0;
+
+       if (!(new->ucounts = alloc_ucounts(new->user_ns, new->euid)))
+               return -EAGAIN;
+
+       if (old_ucounts)
+               put_ucounts(old_ucounts);
+
+       return 0;
+}
+
 /*
  * initialise the credentials stuff
  */
@@ -719,6 +756,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
        if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
                goto error;
 
+       new->ucounts = get_ucounts(new->ucounts);
+       if (!new->ucounts)
+               goto error;
+
        put_cred(old);
        validate_creds(new);
        return new;
index 65809fa..9a89e7f 100644 (file)
@@ -188,7 +188,7 @@ repeat:
        /* don't need to get the RCU readlock here - the process is dead and
         * can't be modifying its own credentials. But shut RCU-lockdep up */
        rcu_read_lock();
-       atomic_dec(&__task_cred(p)->user->processes);
+       dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        rcu_read_unlock();
 
        cgroup_release(p);
index 4820c2a..b4386ff 100644 (file)
@@ -825,9 +825,14 @@ void __init fork_init(void)
        init_task.signal->rlim[RLIMIT_SIGPENDING] =
                init_task.signal->rlim[RLIMIT_NPROC];
 
-       for (i = 0; i < UCOUNT_COUNTS; i++)
+       for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
                init_user_ns.ucount_max[i] = max_threads/2;
 
+       set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC));
+       set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE));
+       set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING));
+       set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK));
+
 #ifdef CONFIG_VMAP_STACK
        cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
                          NULL, free_vm_stack_cache);
@@ -1978,8 +1983,7 @@ static __latent_entropy struct task_struct *copy_process(
        DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
 #endif
        retval = -EAGAIN;
-       if (atomic_read(&p->real_cred->user->processes) >=
-                       task_rlimit(p, RLIMIT_NPROC)) {
+       if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
                if (p->real_cred->user != INIT_USER &&
                    !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
                        goto bad_fork_free;
@@ -2388,7 +2392,7 @@ bad_fork_cleanup_threadgroup_lock:
 #endif
        delayacct_tsk_free(p);
 bad_fork_cleanup_count:
-       atomic_dec(&p->cred->user->processes);
+       dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
        exit_creds(p);
 bad_fork_free:
        WRITE_ONCE(p->__state, TASK_DEAD);
@@ -3001,6 +3005,12 @@ int ksys_unshare(unsigned long unshare_flags)
        if (err)
                goto bad_unshare_cleanup_cred;
 
+       if (new_cred) {
+               err = set_cred_ucounts(new_cred);
+               if (err)
+                       goto bad_unshare_cleanup_cred;
+       }
+
        if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
                if (do_sysvsem) {
                        /*
index 20d1d89..de09203 100644 (file)
@@ -412,8 +412,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
                 int override_rlimit, const unsigned int sigqueue_flags)
 {
        struct sigqueue *q = NULL;
-       struct user_struct *user;
-       int sigpending;
+       struct ucounts *ucounts = NULL;
+       long sigpending;
 
        /*
         * Protect access to @t credentials. This can go away when all
@@ -424,27 +424,26 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
         * changes from/to zero.
         */
        rcu_read_lock();
-       user = __task_cred(t)->user;
-       sigpending = atomic_inc_return(&user->sigpending);
+       ucounts = task_ucounts(t);
+       sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
        if (sigpending == 1)
-               get_uid(user);
+               ucounts = get_ucounts(ucounts);
        rcu_read_unlock();
 
-       if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
+       if (override_rlimit || (sigpending < LONG_MAX && sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
                q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
        } else {
                print_dropped_signal(sig);
        }
 
        if (unlikely(q == NULL)) {
-               if (atomic_dec_and_test(&user->sigpending))
-                       free_uid(user);
+               if (ucounts && dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1))
+                       put_ucounts(ucounts);
        } else {
                INIT_LIST_HEAD(&q->list);
                q->flags = sigqueue_flags;
-               q->user = user;
+               q->ucounts = ucounts;
        }
-
        return q;
 }
 
@@ -452,8 +451,10 @@ static void __sigqueue_free(struct sigqueue *q)
 {
        if (q->flags & SIGQUEUE_PREALLOC)
                return;
-       if (atomic_dec_and_test(&q->user->sigpending))
-               free_uid(q->user);
+       if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) {
+               put_ucounts(q->ucounts);
+               q->ucounts = NULL;
+       }
        kmem_cache_free(sigqueue_cachep, q);
 }
 
index 9de46a4..ef1a78f 100644 (file)
@@ -479,7 +479,7 @@ static int set_user(struct cred *new)
         * for programs doing set*uid()+execve() by harmlessly deferring the
         * failure to the execve() stage.
         */
-       if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) &&
+       if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
                        new_user != INIT_USER)
                current->flags |= PF_NPROC_EXCEEDED;
        else
@@ -558,6 +558,10 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
        if (retval < 0)
                goto error;
 
+       retval = set_cred_ucounts(new);
+       if (retval < 0)
+               goto error;
+
        return commit_creds(new);
 
 error:
@@ -616,6 +620,10 @@ long __sys_setuid(uid_t uid)
        if (retval < 0)
                goto error;
 
+       retval = set_cred_ucounts(new);
+       if (retval < 0)
+               goto error;
+
        return commit_creds(new);
 
 error:
@@ -691,6 +699,10 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
        if (retval < 0)
                goto error;
 
+       retval = set_cred_ucounts(new);
+       if (retval < 0)
+               goto error;
+
        return commit_creds(new);
 
 error:
index 8d8874f..87799e2 100644 (file)
@@ -8,6 +8,12 @@
 #include <linux/kmemleak.h>
 #include <linux/user_namespace.h>
 
+struct ucounts init_ucounts = {
+       .ns    = &init_user_ns,
+       .uid   = GLOBAL_ROOT_UID,
+       .count = ATOMIC_INIT(1),
+};
+
 #define UCOUNTS_HASHTABLE_BITS 10
 static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
 static DEFINE_SPINLOCK(ucounts_lock);
@@ -78,6 +84,10 @@ static struct ctl_table user_table[] = {
        UCOUNT_ENTRY("max_fanotify_groups"),
        UCOUNT_ENTRY("max_fanotify_marks"),
 #endif
+       { },
+       { },
+       { },
+       { },
        { }
 };
 #endif /* CONFIG_SYSCTL */
@@ -129,7 +139,24 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc
        return NULL;
 }
 
-static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
+/*
+ * Link an already initialised ucounts entry into the hash bucket selected by
+ * its (ns, uid) pair.  The list insertion is done under ucounts_lock, taken
+ * with spin_lock_irq().
+ */
+static void hlist_add_ucounts(struct ucounts *ucounts)
+{
+       struct hlist_head *bucket;
+
+       bucket = ucounts_hashentry(ucounts->ns, ucounts->uid);
+
+       spin_lock_irq(&ucounts_lock);
+       hlist_add_head(&ucounts->node, bucket);
+       spin_unlock_irq(&ucounts_lock);
+}
+
+/*
+ * Take an additional reference on ucounts.
+ *
+ * Returns ucounts on success.  Returns NULL if ucounts is NULL, or if the
+ * increment made the reference count negative; in the latter case the
+ * increment is undone through put_ucounts() before returning.
+ */
+struct ucounts *get_ucounts(struct ucounts *ucounts)
+{
+       if (!ucounts)
+               return NULL;
+
+       if (!atomic_add_negative(1, &ucounts->count))
+               return ucounts;
+
+       /* The counter went negative: drop the reference we just added. */
+       put_ucounts(ucounts);
+       return NULL;
+}
+
+struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
 {
        struct hlist_head *hashent = ucounts_hashentry(ns, uid);
        struct ucounts *ucounts, *new;
@@ -145,7 +172,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
 
                new->ns = ns;
                new->uid = uid;
-               new->count = 0;
+               atomic_set(&new->count, 1);
 
                spin_lock_irq(&ucounts_lock);
                ucounts = find_ucounts(ns, uid, hashent);
@@ -153,40 +180,35 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
                        kfree(new);
                } else {
                        hlist_add_head(&new->node, hashent);
-                       ucounts = new;
+                       spin_unlock_irq(&ucounts_lock);
+                       return new;
                }
        }
-       if (ucounts->count == INT_MAX)
-               ucounts = NULL;
-       else
-               ucounts->count += 1;
        spin_unlock_irq(&ucounts_lock);
+       ucounts = get_ucounts(ucounts);
        return ucounts;
 }
 
+/*
+ * Drop one reference on ucounts.  When the last reference goes away the
+ * entry is unhashed and freed.
+ *
+ * The final decrement is performed with ucounts_lock held
+ * (atomic_dec_and_lock_irqsave()), so a lookup running under ucounts_lock
+ * never finds an entry whose count has already reached zero.
+ *
+ * NOTE(review): alloc_ucounts() takes its reference after dropping
+ * ucounts_lock; it has to take it while still holding the lock for the
+ * lookup/put race to be closed completely.
+ */
-static void put_ucounts(struct ucounts *ucounts)
+void put_ucounts(struct ucounts *ucounts)
 {
        unsigned long flags;
 
-       spin_lock_irqsave(&ucounts_lock, flags);
-       ucounts->count -= 1;
-       if (!ucounts->count)
+       /* Returns true, with ucounts_lock held, only when the count hits zero. */
+       if (atomic_dec_and_lock_irqsave(&ucounts->count, &ucounts_lock, flags)) {
                hlist_del_init(&ucounts->node);
-       else
-               ucounts = NULL;
-       spin_unlock_irqrestore(&ucounts_lock, flags);
-
-       kfree(ucounts);
+               spin_unlock_irqrestore(&ucounts_lock, flags);
+               kfree(ucounts);
+       }
 }
 
-static inline bool atomic_inc_below(atomic_t *v, int u)
+static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
 {
-       int c, old;
-       c = atomic_read(v);
+       long c, old;
+       c = atomic_long_read(v);
        for (;;) {
                if (unlikely(c >= u))
                        return false;
-               old = atomic_cmpxchg(v, c, c+1);
+               old = atomic_long_cmpxchg(v, c, c+1);
                if (likely(old == c))
                        return true;
                c = old;
@@ -198,19 +220,19 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
 {
        struct ucounts *ucounts, *iter, *bad;
        struct user_namespace *tns;
-       ucounts = get_ucounts(ns, uid);
+       ucounts = alloc_ucounts(ns, uid);
        for (iter = ucounts; iter; iter = tns->ucounts) {
-               int max;
+               long max;
                tns = iter->ns;
                max = READ_ONCE(tns->ucount_max[type]);
-               if (!atomic_inc_below(&iter->ucount[type], max))
+               if (!atomic_long_inc_below(&iter->ucount[type], max))
                        goto fail;
        }
        return ucounts;
 fail:
        bad = iter;
        for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
-               atomic_dec(&iter->ucount[type]);
+               atomic_long_dec(&iter->ucount[type]);
 
        put_ucounts(ucounts);
        return NULL;
@@ -220,12 +242,54 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
 {
        struct ucounts *iter;
        for (iter = ucounts; iter; iter = iter->ns->ucounts) {
-               int dec = atomic_dec_if_positive(&iter->ucount[type]);
+               long dec = atomic_long_dec_if_positive(&iter->ucount[type]);
                WARN_ON_ONCE(dec < 0);
        }
        put_ucounts(ucounts);
 }
 
+/*
+ * Add v to the 'type' counter of ucounts and of every ancestor reached
+ * through ns->ucounts.
+ *
+ * Returns the new value of the counter in ucounts itself, or LONG_MAX when
+ * any counter in the chain became negative or exceeded the limit that applies
+ * to it.  The counters stay incremented in either case; callers such as
+ * user_shm_lock() undo a rejected charge with dec_rlimit_ucounts().
+ *
+ * A namespace's ucount_max[type] is filled in by create_user_ns() from the
+ * rlimit of the task creating the namespace, so it bounds the counter one
+ * level up (the creator's ucounts), not the counters inside the namespace.
+ * The first level is therefore only checked for wrap-around here; the caller
+ * compares the returned value against the current rlimit.
+ */
+long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+       struct ucounts *iter;
+       long max = LONG_MAX;
+       long ret = 0;
+
+       for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+               long new = atomic_long_add_return(v, &iter->ucount[type]);
+
+               if (new < 0 || new > max)
+                       ret = LONG_MAX;
+               else if (iter == ucounts)
+                       ret = new;
+               /* Limit recorded in this namespace applies to the parent level. */
+               max = READ_ONCE(iter->ns->ucount_max[type]);
+       }
+       return ret;
+}
+
+/*
+ * Subtract v from the 'type' counter of ucounts and of every ancestor reached
+ * through ns->ucounts, warning once if any counter drops below zero.
+ *
+ * Returns true when the counter of ucounts itself reached exactly zero.
+ */
+bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
+{
+       struct ucounts *pos = ucounts;
+       long remaining = -1;    /* overwritten on the first iteration, if any */
+
+       while (pos) {
+               long val = atomic_long_add_return(-v, &pos->ucount[type]);
+
+               WARN_ON_ONCE(val < 0);
+               if (pos == ucounts)
+                       remaining = val;
+               pos = pos->ns->ucounts;
+       }
+       return remaining == 0;
+}
+
+/*
+ * Report whether the 'type' counter of ucounts, or of any ancestor reached
+ * through ns->ucounts, is over its limit.
+ *
+ * @max is the limit for ucounts itself (callers pass the current rlimit).
+ * Each further level is checked against the ucount_max[type] recorded in the
+ * namespace below it, which create_user_ns() fills in from the rlimit of the
+ * task creating that namespace.
+ *
+ * The counters are atomic_long_t, so the comparison is done on signed longs:
+ * @max is clamped to LONG_MAX and a negative (wrapped) counter is treated as
+ * over the limit.  Returns false for a NULL ucounts.
+ */
+bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
+{
+       struct ucounts *iter;
+       long limit = max > LONG_MAX ? LONG_MAX : (long)max;
+
+       for (iter = ucounts; iter; iter = iter->ns->ucounts) {
+               long val = get_ucounts_value(iter, type);
+
+               if (val < 0 || val > limit)
+                       return true;
+               /* Limit recorded in this namespace applies to the parent level. */
+               limit = READ_ONCE(iter->ns->ucount_max[type]);
+       }
+       return false;
+}
+
 static __init int user_namespace_sysctl_init(void)
 {
 #ifdef CONFIG_SYSCTL
@@ -241,6 +305,8 @@ static __init int user_namespace_sysctl_init(void)
        BUG_ON(!user_header);
        BUG_ON(!setup_userns_sysctls(&init_user_ns));
 #endif
+       hlist_add_ucounts(&init_ucounts);
+       inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
        return 0;
 }
 subsys_initcall(user_namespace_sysctl_init);
index a2478cd..c82399c 100644 (file)
@@ -98,9 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
 /* root_user.__count is 1, for init task cred */
 struct user_struct root_user = {
        .__count        = REFCOUNT_INIT(1),
-       .processes      = ATOMIC_INIT(1),
-       .sigpending     = ATOMIC_INIT(0),
-       .locked_shm     = 0,
        .uid            = GLOBAL_ROOT_UID,
        .ratelimit      = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
 };
index 8d62863..ef82d40 100644 (file)
@@ -119,9 +119,13 @@ int create_user_ns(struct cred *new)
        ns->owner = owner;
        ns->group = group;
        INIT_WORK(&ns->work, free_user_ns);
-       for (i = 0; i < UCOUNT_COUNTS; i++) {
+       for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
                ns->ucount_max[i] = INT_MAX;
        }
+       set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
+       set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
+       set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
+       set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
        ns->ucounts = ucounts;
 
        /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
@@ -1340,6 +1344,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns)
        put_user_ns(cred->user_ns);
        set_cred_user_ns(cred, get_user_ns(user_ns));
 
+       if (set_cred_ucounts(cred) < 0)
+               return -EINVAL;
+
        return 0;
 }
 
index 2647c89..081dd33 100644 (file)
@@ -297,9 +297,9 @@ SYSCALL_DEFINE2(memfd_create,
        }
 
        if (flags & MFD_HUGETLB) {
-               struct user_struct *user = NULL;
+               struct ucounts *ucounts = NULL;
 
-               file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user,
+               file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts,
                                        HUGETLB_ANONHUGE_INODE,
                                        (flags >> MFD_HUGE_SHIFT) &
                                        MFD_HUGE_MASK);
index df590fd..e338ebc 100644 (file)
@@ -817,9 +817,10 @@ SYSCALL_DEFINE0(munlockall)
  */
 static DEFINE_SPINLOCK(shmlock_user_lock);
 
-int user_shm_lock(size_t size, struct user_struct *user)
+int user_shm_lock(size_t size, struct ucounts *ucounts)
 {
        unsigned long lock_limit, locked;
+       long memlock;
        int allowed = 0;
 
        locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
@@ -828,21 +829,26 @@ int user_shm_lock(size_t size, struct user_struct *user)
                allowed = 1;
        lock_limit >>= PAGE_SHIFT;
        spin_lock(&shmlock_user_lock);
-       if (!allowed &&
-           locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+       memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+
+       if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
+               dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
+               goto out;
+       }
+       if (!get_ucounts(ucounts)) {
+               dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
                goto out;
-       get_uid(user);
-       user->locked_shm += locked;
+       }
        allowed = 1;
 out:
        spin_unlock(&shmlock_user_lock);
        return allowed;
 }
 
+/*
+ * Undo a successful user_shm_lock(): subtract the page count for 'size'
+ * (rounded up to whole pages) from the UCOUNT_RLIMIT_MEMLOCK counter while
+ * holding shmlock_user_lock, then drop the reference on ucounts.
+ */
-void user_shm_unlock(size_t size, struct user_struct *user)
+void user_shm_unlock(size_t size, struct ucounts *ucounts)
 {
+       size_t pages = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+
        spin_lock(&shmlock_user_lock);
-       user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, pages);
        spin_unlock(&shmlock_user_lock);
-       free_uid(user);
+       put_ucounts(ucounts);
 }
index 0584e54..bc88d16 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1611,7 +1611,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                        goto out_fput;
                }
        } else if (flags & MAP_HUGETLB) {
-               struct user_struct *user = NULL;
+               struct ucounts *ucounts = NULL;
                struct hstate *hs;
 
                hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
@@ -1627,7 +1627,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
                 */
                file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
                                VM_NORESERVE,
-                               &user, HUGETLB_ANONHUGE_INODE,
+                               &ucounts, HUGETLB_ANONHUGE_INODE,
                                (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
                if (IS_ERR(file))
                        return PTR_ERR(file);
index 5d46611..14997a9 100644 (file)
@@ -2227,7 +2227,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
 }
 #endif
 
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
 {
        struct inode *inode = file_inode(file);
        struct shmem_inode_info *info = SHMEM_I(inode);
@@ -2239,13 +2239,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
         * no serialization needed when called from shm_destroy().
         */
        if (lock && !(info->flags & VM_LOCKED)) {
-               if (!user_shm_lock(inode->i_size, user))
+               if (!user_shm_lock(inode->i_size, ucounts))
                        goto out_nomem;
                info->flags |= VM_LOCKED;
                mapping_set_unevictable(file->f_mapping);
        }
-       if (!lock && (info->flags & VM_LOCKED) && user) {
-               user_shm_unlock(inode->i_size, user);
+       if (!lock && (info->flags & VM_LOCKED) && ucounts) {
+               user_shm_unlock(inode->i_size, ucounts);
                info->flags &= ~VM_LOCKED;
                mapping_clear_unevictable(file->f_mapping);
        }
@@ -4092,7 +4092,7 @@ int shmem_unuse(unsigned int type, bool frontswap,
        return 0;
 }
 
-int shmem_lock(struct file *file, int lock, struct user_struct *user)
+int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
 {
        return 0;
 }
index bc3299a..fb010a3 100644 (file)
@@ -49,6 +49,7 @@ TARGETS += proc
 TARGETS += pstore
 TARGETS += ptrace
 TARGETS += openat2
+TARGETS += rlimits
 TARGETS += rseq
 TARGETS += rtc
 TARGETS += seccomp
diff --git a/tools/testing/selftests/rlimits/.gitignore b/tools/testing/selftests/rlimits/.gitignore
new file mode 100644 (file)
index 0000000..091021f
--- /dev/null
@@ -0,0 +1,2 @@
+# SPDX-License-Identifier: GPL-2.0-only
+rlimits-per-userns
diff --git a/tools/testing/selftests/rlimits/Makefile b/tools/testing/selftests/rlimits/Makefile
new file mode 100644 (file)
index 0000000..03aadb4
--- /dev/null
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+CFLAGS += -Wall -O2 -g
+TEST_GEN_PROGS := rlimits-per-userns
+
+include ../lib.mk
diff --git a/tools/testing/selftests/rlimits/config b/tools/testing/selftests/rlimits/config
new file mode 100644 (file)
index 0000000..416bd53
--- /dev/null
@@ -0,0 +1 @@
+CONFIG_USER_NS=y
diff --git a/tools/testing/selftests/rlimits/rlimits-per-userns.c b/tools/testing/selftests/rlimits/rlimits-per-userns.c
new file mode 100644 (file)
index 0000000..26dc949
--- /dev/null
@@ -0,0 +1,161 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Alexey Gladkov <gladkov.alexey@gmail.com>
+ */
+#define _GNU_SOURCE
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/time.h>
+#include <sys/resource.h>
+#include <sys/prctl.h>
+#include <sys/stat.h>
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sched.h>
+#include <signal.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <err.h>
+
+#define NR_CHILDS 2
+
+static char *service_prog;
+static uid_t user   = 60000;
+static uid_t group  = 60000;
+
+/*
+ * Set both the soft and the hard RLIMIT_NPROC of the calling process to n,
+ * logging the requested value first.  Exits with EXIT_FAILURE if setrlimit()
+ * fails.
+ */
+static void setrlimit_nproc(rlim_t n)
+{
+       pid_t pid = getpid();
+       struct rlimit limit = {
+               .rlim_cur = n,
+               .rlim_max = n
+       };
+
+       /* rlim_t is an unsigned type: print it as such instead of via %ld. */
+       warnx("(pid=%d): Setting RLIMIT_NPROC=%llu", pid, (unsigned long long)n);
+
+       if (setrlimit(RLIMIT_NPROC, &limit) < 0)
+               err(EXIT_FAILURE, "(pid=%d): setrlimit(RLIMIT_NPROC)", pid);
+}
+
+/*
+ * Fork one "service" child.
+ *
+ * In the parent this returns the child's pid.  The child never returns: it
+ * requests SIGKILL on parent death, restores the default SIGUSR1 action,
+ * switches to gid 'group' and uid 'user', unshares its user namespace and
+ * finally re-executes service_prog with I_AM_SERVICE=1 as its only
+ * environment variable.  Any failure along the way terminates the process
+ * through err(EXIT_FAILURE, ...).
+ */
+static pid_t fork_child(void)
+{
+       char *const svc_argv[] = { "service", NULL };
+       char *const svc_envp[] = { "I_AM_SERVICE=1", NULL };
+       pid_t self;
+       pid_t forked = fork();
+
+       if (forked < 0)
+               err(EXIT_FAILURE, "fork");
+
+       /* Parent: hand the child's pid back to the caller. */
+       if (forked > 0)
+               return forked;
+
+       /* Child from here on. */
+       self = getpid();
+
+       warnx("(pid=%d): New process starting ...", self);
+
+       if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
+               err(EXIT_FAILURE, "(pid=%d): prctl(PR_SET_PDEATHSIG)", self);
+
+       signal(SIGUSR1, SIG_DFL);
+
+       warnx("(pid=%d): Changing to uid=%d, gid=%d", self, user, group);
+
+       /* Drop the group first, then the user. */
+       if (setgid(group) < 0)
+               err(EXIT_FAILURE, "(pid=%d): setgid(%d)", self, group);
+       if (setuid(user) < 0)
+               err(EXIT_FAILURE, "(pid=%d): setuid(%d)", self, user);
+
+       warnx("(pid=%d): Service running ...", self);
+
+       warnx("(pid=%d): Unshare user namespace", self);
+       if (unshare(CLONE_NEWUSER) < 0)
+               err(EXIT_FAILURE, "unshare(CLONE_NEWUSER)");
+
+       warnx("(pid=%d): Executing real service ...", self);
+
+       /* execve() only returns on failure. */
+       execve(service_prog, svc_argv, svc_envp);
+       err(EXIT_FAILURE, "(pid=%d): execve", self);
+}
+
+/*
+ * Test driver.
+ *
+ * When started with I_AM_SERVICE in the environment (the re-exec done by
+ * fork_child()), just block in pause() until a signal arrives.
+ *
+ * Otherwise: set RLIMIT_NPROC to 1, start NR_CHILDS children through
+ * fork_child(), then alternate between polling them with waitpid(WNOHANG)
+ * and sending SIGUSR1 to those still alive.  The test passes only if every
+ * child was terminated by SIGUSR1; a child that exited on its own, was killed
+ * by another signal, or whose status could not be collected fails the test.
+ */
+int main(int argc, char **argv)
+{
+       size_t i;
+       pid_t child[NR_CHILDS];
+       int wstatus[NR_CHILDS];
+       int collected[NR_CHILDS];       /* 1 once waitpid() stored a status */
+       int childs = NR_CHILDS;
+       pid_t pid;
+
+       if (getenv("I_AM_SERVICE")) {
+               pause();
+               exit(EXIT_SUCCESS);
+       }
+
+       service_prog = argv[0];
+       pid = getpid();
+
+       warnx("(pid=%d) Starting testcase", pid);
+
+       /*
+        * This rlimit is not a problem for root because it can be exceeded.
+        */
+       setrlimit_nproc(1);
+
+       for (i = 0; i < NR_CHILDS; i++) {
+               child[i] = fork_child();
+               wstatus[i] = 0;
+               collected[i] = 0;
+               usleep(250000);
+       }
+
+       while (1) {
+               for (i = 0; i < NR_CHILDS; i++) {
+                       pid_t ret;
+
+                       /* Negated pids mark children that are already done. */
+                       if (child[i] <= 0)
+                               continue;
+
+                       errno = 0;
+                       ret = waitpid(child[i], &wstatus[i], WNOHANG);
+
+                       /* Still running. */
+                       if (!ret)
+                               continue;
+
+                       if (ret < 0) {
+                               /*
+                                * waitpid() failed, so wstatus[i] was not
+                                * written and must not be interpreted.
+                                */
+                               if (errno != ECHILD)
+                                       warn("(pid=%d): waitpid(%d)", pid, child[i]);
+                       } else {
+                               if (!WIFEXITED(wstatus[i]) && !WIFSIGNALED(wstatus[i]))
+                                       continue;
+                               collected[i] = 1;
+                       }
+
+                       child[i] *= -1;
+                       childs -= 1;
+               }
+
+               if (!childs)
+                       break;
+
+               usleep(250000);
+
+               for (i = 0; i < NR_CHILDS; i++) {
+                       if (child[i] <= 0)
+                               continue;
+                       kill(child[i], SIGUSR1);
+               }
+       }
+
+       for (i = 0; i < NR_CHILDS; i++) {
+               if (!collected[i])
+                       warnx("(pid=%d): pid %d: exit status not collected",
+                               pid, -child[i]);
+               else if (WIFEXITED(wstatus[i]))
+                       warnx("(pid=%d): pid %d exited, status=%d",
+                               pid, -child[i], WEXITSTATUS(wstatus[i]));
+               else if (WIFSIGNALED(wstatus[i]))
+                       warnx("(pid=%d): pid %d killed by signal %d",
+                               pid, -child[i], WTERMSIG(wstatus[i]));
+
+               /* Only a child terminated by our SIGUSR1 counts as success. */
+               if (collected[i] && WIFSIGNALED(wstatus[i]) &&
+                   WTERMSIG(wstatus[i]) == SIGUSR1)
+                       continue;
+
+               warnx("(pid=%d): Test failed", pid);
+               exit(EXIT_FAILURE);
+       }
+
+       warnx("(pid=%d): Test passed", pid);
+       exit(EXIT_SUCCESS);
+}