Merge branch 'for-5.16' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup

author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 Nov 2021 22:37:27 +0000 (15:37 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 2 Nov 2021 22:37:27 +0000 (15:37 -0700)

diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst

index 81d37ac..2aeb7ae 100644 (file)
--- a/Documentation/admin-guide/cgroup-v2.rst
+++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -2318,6 +2318,16 @@ Miscellaneous controller provides 3 interface files. If two misc resources (res_
          Limits can be set higher than the capacity value in the misc.capacity
          file.
  
+  misc.events
+       A read-only flat-keyed file which exists on non-root cgroups. The
+       following entries are defined. Unless specified otherwise, a value
+       change in this file generates a file modified event. All fields in
+       this file are hierarchical.
+
+         max
+               The number of times the cgroup's resource usage was
+               about to go over the max boundary.
+
  Migration and Ownership
  ~~~~~~~~~~~~~~~~~~~~~~~
  
diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h

index 3536ab4..11820a4 100644 (file)
--- a/include/linux/bpf-cgroup.h
+++ b/include/linux/bpf-cgroup.h
@@ -157,26 +157,6 @@ struct cgroup_bpf {
  int cgroup_bpf_inherit(struct cgroup *cgrp);
  void cgroup_bpf_offline(struct cgroup *cgrp);
  
-int __cgroup_bpf_attach(struct cgroup *cgrp,
-                       struct bpf_prog *prog, struct bpf_prog *replace_prog,
-                       struct bpf_cgroup_link *link,
-                       enum bpf_attach_type type, u32 flags);
-int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-                       struct bpf_cgroup_link *link,
-                       enum bpf_attach_type type);
-int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
-                      union bpf_attr __user *uattr);
-
-/* Wrapper for __cgroup_bpf_*() protected by cgroup_mutex */
-int cgroup_bpf_attach(struct cgroup *cgrp,
-                     struct bpf_prog *prog, struct bpf_prog *replace_prog,
-                     struct bpf_cgroup_link *link, enum bpf_attach_type type,
-                     u32 flags);
-int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-                     enum bpf_attach_type type);
-int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
-                    union bpf_attr __user *uattr);
-
  int __cgroup_bpf_run_filter_skb(struct sock *sk,
                                 struct sk_buff *skb,
                                 enum cgroup_bpf_attach_type atype);
diff --git a/include/linux/misc_cgroup.h b/include/linux/misc_cgroup.h

index da2367e..c238207 100644 (file)
--- a/include/linux/misc_cgroup.h
+++ b/include/linux/misc_cgroup.h
@@ -36,7 +36,7 @@ struct misc_cg;
  struct misc_res {
         unsigned long max;
         atomic_long_t usage;
-       bool failed;
+       atomic_long_t events;
  };
  
  /**
@@ -46,6 +46,10 @@ struct misc_res {
   */
  struct misc_cg {
         struct cgroup_subsys_state css;
+
+       /* misc.events */
+       struct cgroup_file events_file;
+
         struct misc_res res[MISC_CG_RES_TYPES];
  };
  
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c

index 03145d4..2ca643a 100644 (file)
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -430,10 +430,10 @@ static struct bpf_prog_list *find_attach_entry(struct list_head *progs,
   * Exactly one of @prog or @link can be non-null.
   * Must be called with cgroup_mutex held.
   */
-int __cgroup_bpf_attach(struct cgroup *cgrp,
-                       struct bpf_prog *prog, struct bpf_prog *replace_prog,
-                       struct bpf_cgroup_link *link,
-                       enum bpf_attach_type type, u32 flags)
+static int __cgroup_bpf_attach(struct cgroup *cgrp,
+                              struct bpf_prog *prog, struct bpf_prog *replace_prog,
+                              struct bpf_cgroup_link *link,
+                              enum bpf_attach_type type, u32 flags)
  {
         u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
         struct bpf_prog *old_prog = NULL;
@@ -523,6 +523,20 @@ cleanup:
         return err;
  }
  
+static int cgroup_bpf_attach(struct cgroup *cgrp,
+                            struct bpf_prog *prog, struct bpf_prog *replace_prog,
+                            struct bpf_cgroup_link *link,
+                            enum bpf_attach_type type,
+                            u32 flags)
+{
+       int ret;
+
+       mutex_lock(&cgroup_mutex);
+       ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
+       mutex_unlock(&cgroup_mutex);
+       return ret;
+}
+
  /* Swap updated BPF program for given link in effective program arrays across
   * all descendant cgroups. This function is guaranteed to succeed.
   */
@@ -672,14 +686,14 @@ static struct bpf_prog_list *find_detach_entry(struct list_head *progs,
   *                         propagate the change to descendants
   * @cgrp: The cgroup which descendants to traverse
   * @prog: A program to detach or NULL
- * @prog: A link to detach or NULL
+ * @link: A link to detach or NULL
   * @type: Type of detach operation
   *
   * At most one of @prog or @link can be non-NULL.
   * Must be called with cgroup_mutex held.
   */
-int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-                       struct bpf_cgroup_link *link, enum bpf_attach_type type)
+static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+                              struct bpf_cgroup_link *link, enum bpf_attach_type type)
  {
         enum cgroup_bpf_attach_type atype;
         struct bpf_prog *old_prog;
@@ -730,9 +744,20 @@ cleanup:
         return err;
  }
  
+static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
+                            enum bpf_attach_type type)
+{
+       int ret;
+
+       mutex_lock(&cgroup_mutex);
+       ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
+       mutex_unlock(&cgroup_mutex);
+       return ret;
+}
+
  /* Must be called with cgroup_mutex held to avoid races. */
-int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
-                      union bpf_attr __user *uattr)
+static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+                             union bpf_attr __user *uattr)
  {
         __u32 __user *prog_ids = u64_to_user_ptr(attr->query.prog_ids);
         enum bpf_attach_type type = attr->query.attach_type;
@@ -789,6 +814,17 @@ int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
         return ret;
  }
  
+static int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
+                           union bpf_attr __user *uattr)
+{
+       int ret;
+
+       mutex_lock(&cgroup_mutex);
+       ret = __cgroup_bpf_query(cgrp, attr, uattr);
+       mutex_unlock(&cgroup_mutex);
+       return ret;
+}
+
  int cgroup_bpf_prog_attach(const union bpf_attr *attr,
                            enum bpf_prog_type ptype, struct bpf_prog *prog)
  {
diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c

index 35b9203..81c9e06 100644 (file)
--- a/kernel/cgroup/cgroup-v1.c
+++ b/kernel/cgroup/cgroup-v1.c
@@ -63,9 +63,6 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
         for_each_root(root) {
                 struct cgroup *from_cgrp;
  
-               if (root == &cgrp_dfl_root)
-                       continue;
-
                 spin_lock_irq(&css_set_lock);
                 from_cgrp = task_cgroup_from_root(from, root);
                 spin_unlock_irq(&css_set_lock);
@@ -662,11 +659,9 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
  
         seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
         /*
-        * ideally we don't want subsystems moving around while we do this.
-        * cgroup_mutex is also necessary to guarantee an atomic snapshot of
-        * subsys/hierarchy state.
+        * Grab the subsystems state racily. No need to add avenue to
+        * cgroup_mutex contention.
          */
-       mutex_lock(&cgroup_mutex);
  
         for_each_subsys(ss, i)
                 seq_printf(m, "%s\t%d\t%d\t%d\n",
@@ -674,7 +669,6 @@ int proc_cgroupstats_show(struct seq_file *m, void *v)
                            atomic_read(&ss->root->nr_cgrps),
                            cgroup_ssid_enabled(i));
  
-       mutex_unlock(&cgroup_mutex);
         return 0;
  }
  
@@ -701,8 +695,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
             kernfs_type(kn) != KERNFS_DIR)
                 return -EINVAL;
  
-       mutex_lock(&cgroup_mutex);
-
         /*
          * We aren't being called from kernfs and there's no guarantee on
          * @kn->priv's validity.  For this and css_tryget_online_from_dir(),
@@ -710,9 +702,8 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
          */
         rcu_read_lock();
         cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
-       if (!cgrp || cgroup_is_dead(cgrp)) {
+       if (!cgrp || !cgroup_tryget(cgrp)) {
                 rcu_read_unlock();
-               mutex_unlock(&cgroup_mutex);
                 return -ENOENT;
         }
         rcu_read_unlock();
@@ -740,7 +731,7 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
         }
         css_task_iter_end(&it);
  
-       mutex_unlock(&cgroup_mutex);
+       cgroup_put(cgrp);
         return 0;
  }
  
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c

index ea08f01..919194d 100644 (file)
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -1740,6 +1740,7 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
         struct cgroup *dcgrp = &dst_root->cgrp;
         struct cgroup_subsys *ss;
         int ssid, i, ret;
+       u16 dfl_disable_ss_mask = 0;
  
         lockdep_assert_held(&cgroup_mutex);
  
@@ -1756,8 +1757,28 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
                 /* can't move between two non-dummy roots either */
                 if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root)
                         return -EBUSY;
+
+               /*
+                * Collect ssid's that need to be disabled from default
+                * hierarchy.
+                */
+               if (ss->root == &cgrp_dfl_root)
+                       dfl_disable_ss_mask |= 1 << ssid;
+
         } while_each_subsys_mask();
  
+       if (dfl_disable_ss_mask) {
+               struct cgroup *scgrp = &cgrp_dfl_root.cgrp;
+
+               /*
+                * Controllers from default hierarchy that need to be rebound
+                * are all disabled together in one go.
+                */
+               cgrp_dfl_root.subsys_mask &= ~dfl_disable_ss_mask;
+               WARN_ON(cgroup_apply_control(scgrp));
+               cgroup_finalize_control(scgrp, 0);
+       }
+
         do_each_subsys_mask(ss, ssid, ss_mask) {
                 struct cgroup_root *src_root = ss->root;
                 struct cgroup *scgrp = &src_root->cgrp;
@@ -1766,10 +1787,12 @@ int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask)
  
                 WARN_ON(!css || cgroup_css(dcgrp, ss));
  
-               /* disable from the source */
-               src_root->subsys_mask &= ~(1 << ssid);
-               WARN_ON(cgroup_apply_control(scgrp));
-               cgroup_finalize_control(scgrp, 0);
+               if (src_root != &cgrp_dfl_root) {
+                       /* disable from the source */
+                       src_root->subsys_mask &= ~(1 << ssid);
+                       WARN_ON(cgroup_apply_control(scgrp));
+                       cgroup_finalize_control(scgrp, 0);
+               }
  
                 /* rebind */
                 RCU_INIT_POINTER(scgrp->subsys[ssid], NULL);
@@ -5911,17 +5934,20 @@ struct cgroup *cgroup_get_from_id(u64 id)
         struct kernfs_node *kn;
         struct cgroup *cgrp = NULL;
  
-       mutex_lock(&cgroup_mutex);
         kn = kernfs_find_and_get_node_by_id(cgrp_dfl_root.kf_root, id);
         if (!kn)
-               goto out_unlock;
+               goto out;
  
-       cgrp = kn->priv;
-       if (cgroup_is_dead(cgrp) || !cgroup_tryget(cgrp))
+       rcu_read_lock();
+
+       cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+       if (cgrp && !cgroup_tryget(cgrp))
                 cgrp = NULL;
+
+       rcu_read_unlock();
+
         kernfs_put(kn);
-out_unlock:
-       mutex_unlock(&cgroup_mutex);
+out:
         return cgrp;
  }
  EXPORT_SYMBOL_GPL(cgroup_get_from_id);
@@ -6474,30 +6500,34 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss)
   *
   * Find the cgroup at @path on the default hierarchy, increment its
   * reference count and return it.  Returns pointer to the found cgroup on
- * success, ERR_PTR(-ENOENT) if @path doesn't exist and ERR_PTR(-ENOTDIR)
- * if @path points to a non-directory.
+ * success, ERR_PTR(-ENOENT) if @path doesn't exist or if the cgroup has already
+ * been released and ERR_PTR(-ENOTDIR) if @path points to a non-directory.
   */
  struct cgroup *cgroup_get_from_path(const char *path)
  {
         struct kernfs_node *kn;
-       struct cgroup *cgrp;
-
-       mutex_lock(&cgroup_mutex);
+       struct cgroup *cgrp = ERR_PTR(-ENOENT);
  
         kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path);
-       if (kn) {
-               if (kernfs_type(kn) == KERNFS_DIR) {
-                       cgrp = kn->priv;
-                       cgroup_get_live(cgrp);
-               } else {
-                       cgrp = ERR_PTR(-ENOTDIR);
-               }
-               kernfs_put(kn);
-       } else {
-               cgrp = ERR_PTR(-ENOENT);
+       if (!kn)
+               goto out;
+
+       if (kernfs_type(kn) != KERNFS_DIR) {
+               cgrp = ERR_PTR(-ENOTDIR);
+               goto out_kernfs;
         }
  
-       mutex_unlock(&cgroup_mutex);
+       rcu_read_lock();
+
+       cgrp = rcu_dereference(*(void __rcu __force **)&kn->priv);
+       if (!cgrp || !cgroup_tryget(cgrp))
+               cgrp = ERR_PTR(-ENOENT);
+
+       rcu_read_unlock();
+
+out_kernfs:
+       kernfs_put(kn);
+out:
         return cgrp;
  }
  EXPORT_SYMBOL_GPL(cgroup_get_from_path);
@@ -6625,44 +6655,6 @@ void cgroup_sk_free(struct sock_cgroup_data *skcd)
  
  #endif /* CONFIG_SOCK_CGROUP_DATA */
  
-#ifdef CONFIG_CGROUP_BPF
-int cgroup_bpf_attach(struct cgroup *cgrp,
-                     struct bpf_prog *prog, struct bpf_prog *replace_prog,
-                     struct bpf_cgroup_link *link,
-                     enum bpf_attach_type type,
-                     u32 flags)
-{
-       int ret;
-
-       mutex_lock(&cgroup_mutex);
-       ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
-       mutex_unlock(&cgroup_mutex);
-       return ret;
-}
-
-int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-                     enum bpf_attach_type type)
-{
-       int ret;
-
-       mutex_lock(&cgroup_mutex);
-       ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
-       mutex_unlock(&cgroup_mutex);
-       return ret;
-}
-
-int cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
-                    union bpf_attr __user *uattr)
-{
-       int ret;
-
-       mutex_lock(&cgroup_mutex);
-       ret = __cgroup_bpf_query(cgrp, attr, uattr);
-       mutex_unlock(&cgroup_mutex);
-       return ret;
-}
-#endif /* CONFIG_CGROUP_BPF */
-
  #ifdef CONFIG_SYSFS
  static ssize_t show_delegatable_files(struct cftype *files, char *buf,
                                       ssize_t size, const char *prefix)
diff --git a/kernel/cgroup/misc.c b/kernel/cgroup/misc.c

index ec02d96..fe3e8a0 100644 (file)
--- a/kernel/cgroup/misc.c
+++ b/kernel/cgroup/misc.c
@@ -157,13 +157,6 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
                 new_usage = atomic_long_add_return(amount, &res->usage);
                 if (new_usage > READ_ONCE(res->max) ||
                     new_usage > READ_ONCE(misc_res_capacity[type])) {
-                       if (!res->failed) {
-                               pr_info("cgroup: charge rejected by the misc controller for %s resource in ",
-                                       misc_res_name[type]);
-                               pr_cont_cgroup_path(i->css.cgroup);
-                               pr_cont("\n");
-                               res->failed = true;
-                       }
                         ret = -EBUSY;
                         goto err_charge;
                 }
@@ -171,6 +164,11 @@ int misc_cg_try_charge(enum misc_res_type type, struct misc_cg *cg,
         return 0;
  
  err_charge:
+       for (j = i; j; j = parent_misc(j)) {
+               atomic_long_inc(&j->res[type].events);
+               cgroup_file_notify(&j->events_file);
+       }
+
         for (j = cg; j != i; j = parent_misc(j))
                 misc_cg_cancel_charge(type, j, amount);
         misc_cg_cancel_charge(type, i, amount);
@@ -335,6 +333,19 @@ static int misc_cg_capacity_show(struct seq_file *sf, void *v)
         return 0;
  }
  
+static int misc_events_show(struct seq_file *sf, void *v)
+{
+       struct misc_cg *cg = css_misc(seq_css(sf));
+       unsigned long events, i;
+
+       for (i = 0; i < MISC_CG_RES_TYPES; i++) {
+               events = atomic_long_read(&cg->res[i].events);
+               if (READ_ONCE(misc_res_capacity[i]) || events)
+                       seq_printf(sf, "%s.max %lu\n", misc_res_name[i], events);
+       }
+       return 0;
+}
+
  /* Misc cgroup interface files */
  static struct cftype misc_cg_files[] = {
         {
@@ -353,6 +364,12 @@ static struct cftype misc_cg_files[] = {
                 .seq_show = misc_cg_capacity_show,
                 .flags = CFTYPE_ONLY_ON_ROOT,
         },
+       {
+               .name = "events",
+               .flags = CFTYPE_NOT_ON_ROOT,
+               .file_offset = offsetof(struct misc_cg, events_file),
+               .seq_show = misc_events_show,
+       },
         {}
  };
  
diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c

index b264ab5..1486768 100644 (file)
--- a/kernel/cgroup/rstat.c
+++ b/kernel/cgroup/rstat.c
@@ -433,8 +433,6 @@ static void root_cgroup_cputime(struct task_cputime *cputime)
                 cputime->sum_exec_runtime += user;
                 cputime->sum_exec_runtime += sys;
                 cputime->sum_exec_runtime += cpustat[CPUTIME_STEAL];
-               cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST];
-               cputime->sum_exec_runtime += cpustat[CPUTIME_GUEST_NICE];
         }
  }
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 2 Nov 2021 22:37:27 +0000 (15:37 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 2 Nov 2021 22:37:27 +0000 (15:37 -0700)
Documentation/admin-guide/cgroup-v2.rst		patch | blob | history
include/linux/bpf-cgroup.h		patch | blob | history
include/linux/misc_cgroup.h		patch | blob | history
kernel/bpf/cgroup.c		patch | blob | history
kernel/cgroup/cgroup-v1.c		patch | blob | history
kernel/cgroup/cgroup.c		patch | blob | history
kernel/cgroup/misc.c		patch | blob | history
kernel/cgroup/rstat.c		patch | blob | history