Task Control Groups: automatic userspace notification of idle cgroups
authorPaul Menage <menage@google.com>
Fri, 19 Oct 2007 06:39:38 +0000 (23:39 -0700)
committerLinus Torvalds <torvalds@woody.linux-foundation.org>
Fri, 19 Oct 2007 18:53:36 +0000 (11:53 -0700)
Add the following files to the cgroup filesystem:

notify_on_release - configures/reports whether the cgroup subsystem should
attempt to run a release script when this cgroup becomes unused

release_agent - configures/reports the release agent to be used for this
hierarchy (top level in each hierarchy only)

releasable - reports whether this cgroup would have been auto-released if
notify_on_release was true and a release agent was configured (mainly useful
for debugging)

To avoid locking issues, invoking the userspace release agent is done via a
workqueue task; cgroups that need to have their release agents invoked by
the workqueue task are linked on to a list.

[pj@sgi.com: Need to include kmod.h]
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/cgroup.h
kernel/cgroup.c

index 836b355..9e9b7ef 100644 (file)
@@ -77,10 +77,11 @@ static inline void css_get(struct cgroup_subsys_state *css)
  * css_get()
  */
 
+extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
        if (!test_bit(CSS_ROOT, &css->flags))
-               atomic_dec(&css->refcnt);
+               __css_put(css);
 }
 
 struct cgroup {
@@ -112,6 +113,13 @@ struct cgroup {
         * tasks in this cgroup. Protected by css_set_lock
         */
        struct list_head css_sets;
+
+       /*
+        * Linked list running through all cgroups that can
+        * potentially be reaped by the release agent. Protected by
+        * release_list_lock
+        */
+       struct list_head release_list;
 };
 
 /* A css_set is a structure holding pointers to a set of
@@ -293,7 +301,6 @@ struct task_struct *cgroup_iter_next(struct cgroup *cont,
                                        struct cgroup_iter *it);
 void cgroup_iter_end(struct cgroup *cont, struct cgroup_iter *it);
 
-
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
index 883928c..d65a124 100644 (file)
 #include <linux/spinlock.h>
 #include <linux/string.h>
 #include <linux/sort.h>
+#include <linux/kmod.h>
 #include <asm/atomic.h>
 
+static DEFINE_MUTEX(cgroup_mutex);
+
 /* Generate an array of cgroup subsystem pointers */
 #define SUBSYS(_x) &_x ## _subsys,
 
@@ -83,6 +86,13 @@ struct cgroupfs_root {
 
        /* Hierarchy-specific flags */
        unsigned long flags;
+
+       /* The path to use for release notifications. No locking
+        * between setting and use - so if userspace updates this
+        * while child cgroups exist, you could miss a
+        * notification. We ensure that it's always a valid
+        * NUL-terminated string */
+       char release_agent_path[PATH_MAX];
 };
 
 
@@ -110,7 +120,13 @@ static int need_forkexit_callback;
 
 /* bits in struct cgroup flags field */
 enum {
+       /* Control Group is dead */
        CONT_REMOVED,
+       /* Control Group has previously had a child cgroup or a task,
+        * but no longer (only if CONT_NOTIFY_ON_RELEASE is set) */
+       CONT_RELEASABLE,
+       /* Control Group requires release notifications to userspace */
+       CONT_NOTIFY_ON_RELEASE,
 };
 
 /* convenient tests for these bits */
@@ -124,6 +140,19 @@ enum {
        ROOT_NOPREFIX, /* mounted subsystems have no named prefix */
 };
 
+inline int cgroup_is_releasable(const struct cgroup *cont)
+{
+       const int bits =
+               (1 << CONT_RELEASABLE) |
+               (1 << CONT_NOTIFY_ON_RELEASE);
+       return (cont->flags & bits) == bits;
+}
+
+inline int notify_on_release(const struct cgroup *cont)
+{
+       return test_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+}
+
 /*
  * for_each_subsys() allows you to iterate on each subsystem attached to
  * an active hierarchy
@@ -135,6 +164,14 @@ list_for_each_entry(_ss, &_root->subsys_list, sibling)
 #define for_each_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
+/* the list of cgroups eligible for automatic release. Protected by
+ * release_list_lock */
+static LIST_HEAD(release_list);
+static DEFINE_SPINLOCK(release_list_lock);
+static void cgroup_release_agent(struct work_struct *work);
+static DECLARE_WORK(release_agent_work, cgroup_release_agent);
+static void check_for_release(struct cgroup *cont);
+
 /* Link structure for associating css_set objects with cgroups */
 struct cg_cgroup_link {
        /*
@@ -189,11 +226,8 @@ static int use_task_css_set_links;
 /*
  * unlink a css_set from the list and free it
  */
-static void release_css_set(struct kref *k)
+static void unlink_css_set(struct css_set *cg)
 {
-       struct css_set *cg = container_of(k, struct css_set, ref);
-       int i;
-
        write_lock(&css_set_lock);
        list_del(&cg->list);
        css_set_count--;
@@ -206,11 +240,39 @@ static void release_css_set(struct kref *k)
                kfree(link);
        }
        write_unlock(&css_set_lock);
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++)
-               atomic_dec(&cg->subsys[i]->cgroup->count);
+}
+
+static void __release_css_set(struct kref *k, int taskexit)
+{
+       int i;
+       struct css_set *cg = container_of(k, struct css_set, ref);
+
+       unlink_css_set(cg);
+
+       rcu_read_lock();
+       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               struct cgroup *cont = cg->subsys[i]->cgroup;
+               if (atomic_dec_and_test(&cont->count) &&
+                   notify_on_release(cont)) {
+                       if (taskexit)
+                               set_bit(CONT_RELEASABLE, &cont->flags);
+                       check_for_release(cont);
+               }
+       }
+       rcu_read_unlock();
        kfree(cg);
 }
 
+static void release_css_set(struct kref *k)
+{
+       __release_css_set(k, 0);
+}
+
+static void release_css_set_taskexit(struct kref *k)
+{
+       __release_css_set(k, 1);
+}
+
 /*
  * refcounted get/put for css_set objects
  */
@@ -224,6 +286,11 @@ static inline void put_css_set(struct css_set *cg)
        kref_put(&cg->ref, release_css_set);
 }
 
+static inline void put_css_set_taskexit(struct css_set *cg)
+{
+       kref_put(&cg->ref, release_css_set_taskexit);
+}
+
 /*
  * find_existing_css_set() is a helper for
  * find_css_set(), and checks to see whether an existing
@@ -465,8 +532,6 @@ static struct css_set *find_css_set(
  * update of a tasks cgroup pointer by attach_task()
  */
 
-static DEFINE_MUTEX(cgroup_mutex);
-
 /**
  * cgroup_lock - lock out any changes to cgroup structures
  *
@@ -526,6 +591,13 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
        if (S_ISDIR(inode->i_mode)) {
                struct cgroup *cont = dentry->d_fsdata;
                BUG_ON(!(cgroup_is_removed(cont)));
+               /* It's possible for external users to be holding css
+                * reference counts on a cgroup; css_put() needs to
+                * be able to access the cgroup after decrementing
+                * the reference count in order to know if it needs to
+                * queue the cgroup to be handled by the release
+                * agent */
+               synchronize_rcu();
                kfree(cont);
        }
        iput(inode);
@@ -657,6 +729,8 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_printf(seq, ",%s", ss->name);
        if (test_bit(ROOT_NOPREFIX, &root->flags))
                seq_puts(seq, ",noprefix");
+       if (strlen(root->release_agent_path))
+               seq_printf(seq, ",release_agent=%s", root->release_agent_path);
        mutex_unlock(&cgroup_mutex);
        return 0;
 }
@@ -664,6 +738,7 @@ static int cgroup_show_options(struct seq_file *seq, struct vfsmount *vfs)
 struct cgroup_sb_opts {
        unsigned long subsys_bits;
        unsigned long flags;
+       char *release_agent;
 };
 
 /* Convert a hierarchy specifier into a bitmask of subsystems and
@@ -675,6 +750,7 @@ static int parse_cgroupfs_options(char *data,
 
        opts->subsys_bits = 0;
        opts->flags = 0;
+       opts->release_agent = NULL;
 
        while ((token = strsep(&o, ",")) != NULL) {
                if (!*token)
@@ -683,6 +759,15 @@ static int parse_cgroupfs_options(char *data,
                        opts->subsys_bits = (1 << CGROUP_SUBSYS_COUNT) - 1;
                } else if (!strcmp(token, "noprefix")) {
                        set_bit(ROOT_NOPREFIX, &opts->flags);
+               } else if (!strncmp(token, "release_agent=", 14)) {
+                       /* Specifying two release agents is forbidden */
+                       if (opts->release_agent)
+                               return -EINVAL;
+                       opts->release_agent = kzalloc(PATH_MAX, GFP_KERNEL);
+                       if (!opts->release_agent)
+                               return -ENOMEM;
+                       strncpy(opts->release_agent, token + 14, PATH_MAX - 1);
+                       opts->release_agent[PATH_MAX - 1] = 0;
                } else {
                        struct cgroup_subsys *ss;
                        int i;
@@ -732,7 +817,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
        if (!ret)
                cgroup_populate_dir(cont);
 
+       if (opts.release_agent)
+               strcpy(root->release_agent_path, opts.release_agent);
  out_unlock:
+       if (opts.release_agent)
+               kfree(opts.release_agent);
        mutex_unlock(&cgroup_mutex);
        mutex_unlock(&cont->dentry->d_inode->i_mutex);
        return ret;
@@ -756,6 +845,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
        INIT_LIST_HEAD(&cont->sibling);
        INIT_LIST_HEAD(&cont->children);
        INIT_LIST_HEAD(&cont->css_sets);
+       INIT_LIST_HEAD(&cont->release_list);
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
@@ -830,8 +920,11 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 
        /* First find the desired set of subsystems */
        ret = parse_cgroupfs_options(data, &opts);
-       if (ret)
+       if (ret) {
+               if (opts.release_agent)
+                       kfree(opts.release_agent);
                return ret;
+       }
 
        root = kzalloc(sizeof(*root), GFP_KERNEL);
        if (!root)
@@ -840,6 +933,10 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
        init_cgroup_root(root);
        root->subsys_bits = opts.subsys_bits;
        root->flags = opts.flags;
+       if (opts.release_agent) {
+               strcpy(root->release_agent_path, opts.release_agent);
+               kfree(opts.release_agent);
+       }
 
        sb = sget(fs_type, cgroup_test_super, cgroup_set_super, root);
 
@@ -1120,7 +1217,7 @@ static int attach_task(struct cgroup *cont, struct task_struct *tsk)
                        ss->attach(ss, cont, oldcont, tsk);
                }
        }
-
+       set_bit(CONT_RELEASABLE, &oldcont->flags);
        synchronize_rcu();
        put_css_set(cg);
        return 0;
@@ -1170,6 +1267,9 @@ enum cgroup_filetype {
        FILE_ROOT,
        FILE_DIR,
        FILE_TASKLIST,
+       FILE_NOTIFY_ON_RELEASE,
+       FILE_RELEASABLE,
+       FILE_RELEASE_AGENT,
 };
 
 static ssize_t cgroup_write_uint(struct cgroup *cont, struct cftype *cft,
@@ -1240,6 +1340,32 @@ static ssize_t cgroup_common_file_write(struct cgroup *cont,
        case FILE_TASKLIST:
                retval = attach_task_by_pid(cont, buffer);
                break;
+       case FILE_NOTIFY_ON_RELEASE:
+               clear_bit(CONT_RELEASABLE, &cont->flags);
+               if (simple_strtoul(buffer, NULL, 10) != 0)
+                       set_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+               else
+                       clear_bit(CONT_NOTIFY_ON_RELEASE, &cont->flags);
+               break;
+       case FILE_RELEASE_AGENT:
+       {
+               struct cgroupfs_root *root = cont->root;
+               /* Strip trailing newline */
+               if (nbytes && (buffer[nbytes-1] == '\n')) {
+                       buffer[nbytes-1] = 0;
+               }
+               if (nbytes < sizeof(root->release_agent_path)) {
+                       /* We never write anything other than '\0'
+                        * into the last char of release_agent_path,
+                        * so it always remains a NUL-terminated
+                        * string */
+                       strncpy(root->release_agent_path, buffer, nbytes);
+                       root->release_agent_path[nbytes] = 0;
+               } else {
+                       retval = -ENOSPC;
+               }
+               break;
+       }
        default:
                retval = -EINVAL;
                goto out2;
@@ -1281,6 +1407,49 @@ static ssize_t cgroup_read_uint(struct cgroup *cont, struct cftype *cft,
        return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
+static ssize_t cgroup_common_file_read(struct cgroup *cont,
+                                         struct cftype *cft,
+                                         struct file *file,
+                                         char __user *buf,
+                                         size_t nbytes, loff_t *ppos)
+{
+       enum cgroup_filetype type = cft->private;
+       char *page;
+       ssize_t retval = 0;
+       char *s;
+
+       if (!(page = (char *)__get_free_page(GFP_KERNEL)))
+               return -ENOMEM;
+
+       s = page;
+
+       switch (type) {
+       case FILE_RELEASE_AGENT:
+       {
+               struct cgroupfs_root *root;
+               size_t n;
+               mutex_lock(&cgroup_mutex);
+               root = cont->root;
+               n = strnlen(root->release_agent_path,
+                           sizeof(root->release_agent_path));
+               n = min(n, (size_t) PAGE_SIZE);
+               strncpy(s, root->release_agent_path, n);
+               mutex_unlock(&cgroup_mutex);
+               s += n;
+               break;
+       }
+       default:
+               retval = -EINVAL;
+               goto out;
+       }
+       *s++ = '\n';
+
+       retval = simple_read_from_buffer(buf, nbytes, ppos, page, s - page);
+out:
+       free_page((unsigned long)page);
+       return retval;
+}
+
 static ssize_t cgroup_file_read(struct file *file, char __user *buf,
                                   size_t nbytes, loff_t *ppos)
 {
@@ -1699,16 +1868,49 @@ static int cgroup_tasks_release(struct inode *unused_inode,
        return 0;
 }
 
+static u64 cgroup_read_notify_on_release(struct cgroup *cont,
+                                           struct cftype *cft)
+{
+       return notify_on_release(cont);
+}
+
+static u64 cgroup_read_releasable(struct cgroup *cont, struct cftype *cft)
+{
+       return test_bit(CONT_RELEASABLE, &cont->flags);
+}
+
 /*
  * for the common functions, 'private' gives the type of file
  */
-static struct cftype cft_tasks = {
-       .name = "tasks",
-       .open = cgroup_tasks_open,
-       .read = cgroup_tasks_read,
+static struct cftype files[] = {
+       {
+               .name = "tasks",
+               .open = cgroup_tasks_open,
+               .read = cgroup_tasks_read,
+               .write = cgroup_common_file_write,
+               .release = cgroup_tasks_release,
+               .private = FILE_TASKLIST,
+       },
+
+       {
+               .name = "notify_on_release",
+               .read_uint = cgroup_read_notify_on_release,
+               .write = cgroup_common_file_write,
+               .private = FILE_NOTIFY_ON_RELEASE,
+       },
+
+       {
+               .name = "releasable",
+               .read_uint = cgroup_read_releasable,
+               .private = FILE_RELEASABLE,
+       }
+};
+
+static struct cftype cft_release_agent = {
+       .name = "release_agent",
+       .read = cgroup_common_file_read,
        .write = cgroup_common_file_write,
-       .release = cgroup_tasks_release,
-       .private = FILE_TASKLIST,
+       .private = FILE_RELEASE_AGENT,
 };
 
 static int cgroup_populate_dir(struct cgroup *cont)
@@ -1719,10 +1921,15 @@ static int cgroup_populate_dir(struct cgroup *cont)
        /* First clear out any existing files */
        cgroup_clear_directory(cont->dentry);
 
-       err = cgroup_add_file(cont, NULL, &cft_tasks);
+       err = cgroup_add_files(cont, NULL, files, ARRAY_SIZE(files));
        if (err < 0)
                return err;
 
+       if (cont == cont->top_cgroup) {
+               if ((err = cgroup_add_file(cont, NULL, &cft_release_agent)) < 0)
+                       return err;
+       }
+
        for_each_subsys(cont->root, ss) {
                if (ss->populate && (err = ss->populate(ss, cont)) < 0)
                        return err;
@@ -1779,6 +1986,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
        INIT_LIST_HEAD(&cont->sibling);
        INIT_LIST_HEAD(&cont->children);
        INIT_LIST_HEAD(&cont->css_sets);
+       INIT_LIST_HEAD(&cont->release_list);
 
        cont->parent = parent;
        cont->root = parent->root;
@@ -1840,6 +2048,38 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode)
        return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+static inline int cgroup_has_css_refs(struct cgroup *cont)
+{
+       /* Check the reference count on each subsystem. Since we
+        * already established that there are no tasks in the
+        * cgroup, if the css refcount is also 0, then there should
+        * be no outstanding references, so the subsystem is safe to
+        * destroy. We scan across all subsystems rather than using
+        * the per-hierarchy linked list of mounted subsystems since
+        * we can be called via check_for_release() with no
+        * synchronization other than RCU, and the subsystem linked
+        * list isn't RCU-safe */
+       int i;
+       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               struct cgroup_subsys *ss = subsys[i];
+               struct cgroup_subsys_state *css;
+               /* Skip subsystems not in this hierarchy */
+               if (ss->root != cont->root)
+                       continue;
+               css = cont->subsys[ss->subsys_id];
+               /* When called from check_for_release() it's possible
+                * that by this point the cgroup has been removed
+                * and the css deleted. But a false-positive doesn't
+                * matter, since it can only happen if the cgroup
+                * has been deleted and hence no longer needs the
+                * release agent to be called anyway. */
+               if (css && atomic_read(&css->refcnt)) {
+                       return 1;
+               }
+       }
+       return 0;
+}
+
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
        struct cgroup *cont = dentry->d_fsdata;
@@ -1848,7 +2088,6 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        struct cgroup_subsys *ss;
        struct super_block *sb;
        struct cgroupfs_root *root;
-       int css_busy = 0;
 
        /* the vfs holds both inode->i_mutex already */
 
@@ -1866,20 +2105,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        root = cont->root;
        sb = root->sb;
 
-       /* Check the reference count on each subsystem. Since we
-        * already established that there are no tasks in the
-        * cgroup, if the css refcount is also 0, then there should
-        * be no outstanding references, so the subsystem is safe to
-        * destroy */
-       for_each_subsys(root, ss) {
-               struct cgroup_subsys_state *css;
-               css = cont->subsys[ss->subsys_id];
-               if (atomic_read(&css->refcnt)) {
-                       css_busy = 1;
-                       break;
-               }
-       }
-       if (css_busy) {
+       if (cgroup_has_css_refs(cont)) {
                mutex_unlock(&cgroup_mutex);
                return -EBUSY;
        }
@@ -1889,7 +2115,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
                        ss->destroy(ss, cont);
        }
 
+       spin_lock(&release_list_lock);
        set_bit(CONT_REMOVED, &cont->flags);
+       if (!list_empty(&cont->release_list))
+               list_del(&cont->release_list);
+       spin_unlock(&release_list_lock);
        /* delete my sibling from parent->children */
        list_del(&cont->sibling);
        spin_lock(&cont->dentry->d_lock);
@@ -1901,6 +2131,9 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        dput(d);
        root->number_of_cgroups--;
 
+       set_bit(CONT_RELEASABLE, &parent->flags);
+       check_for_release(parent);
+
        mutex_unlock(&cgroup_mutex);
        /* Drop the active superblock reference that we took when we
         * created the cgroup */
@@ -1938,15 +2171,15 @@ static void cgroup_init_subsys(struct cgroup_subsys *ss)
        /* If this subsystem requested that it be notified with fork
         * events, we should send it one now for every process in the
         * system */
-       if (ss->fork) {
-               struct task_struct *g, *p;
-
-               read_lock(&tasklist_lock);
-               do_each_thread(g, p) {
-                       ss->fork(ss, p);
-               } while_each_thread(g, p);
-               read_unlock(&tasklist_lock);
-       }
+       if (ss->fork) {
+               struct task_struct *g, *p;
+
+               read_lock(&tasklist_lock);
+               do_each_thread(g, p) {
+                       ss->fork(ss, p);
+               } while_each_thread(g, p);
+               read_unlock(&tasklist_lock);
+       }
 
        need_forkexit_callback |= ss->fork || ss->exit;
 
@@ -2263,7 +2496,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
        tsk->cgroups = &init_css_set;
        task_unlock(tsk);
        if (cg)
-               put_css_set(cg);
+               put_css_set_taskexit(cg);
 }
 
 /**
@@ -2374,7 +2607,10 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys)
 
  out_release:
        mutex_unlock(&inode->i_mutex);
+
+       mutex_lock(&cgroup_mutex);
        put_css_set(cg);
+       mutex_unlock(&cgroup_mutex);
        deactivate_super(parent->root->sb);
        return ret;
 }
@@ -2404,3 +2640,111 @@ int cgroup_is_descendant(const struct cgroup *cont)
        ret = (cont == target);
        return ret;
 }
+
+static void check_for_release(struct cgroup *cont)
+{
+       /* All of these checks rely on RCU to keep the cgroup
+        * structure alive */
+       if (cgroup_is_releasable(cont) && !atomic_read(&cont->count)
+           && list_empty(&cont->children) && !cgroup_has_css_refs(cont)) {
+               /* Control Group is currently removeable. If it's not
+                * already queued for a userspace notification, queue
+                * it now */
+               int need_schedule_work = 0;
+               spin_lock(&release_list_lock);
+               if (!cgroup_is_removed(cont) &&
+                   list_empty(&cont->release_list)) {
+                       list_add(&cont->release_list, &release_list);
+                       need_schedule_work = 1;
+               }
+               spin_unlock(&release_list_lock);
+               if (need_schedule_work)
+                       schedule_work(&release_agent_work);
+       }
+}
+
+void __css_put(struct cgroup_subsys_state *css)
+{
+       struct cgroup *cont = css->cgroup;
+       rcu_read_lock();
+       if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cont)) {
+               set_bit(CONT_RELEASABLE, &cont->flags);
+               check_for_release(cont);
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * Notify userspace when a cgroup is released, by running the
+ * configured release agent with the name of the cgroup (path
+ * relative to the root of cgroup file system) as the argument.
+ *
+ * Most likely, this user command will try to rmdir this cgroup.
+ *
+ * This races with the possibility that some other task will be
+ * attached to this cgroup before it is removed, or that some other
+ * user task will 'mkdir' a child cgroup of this cgroup.  That's ok.
+ * The presumed 'rmdir' will fail quietly if this cgroup is no longer
+ * unused, and this cgroup will be reprieved from its death sentence,
+ * to continue to serve a useful existence.  Next time it's released,
+ * we will get notified again, if it still has 'notify_on_release' set.
+ *
+ * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which
+ * means only wait until the task is successfully execve()'d.  The
+ * separate release agent task is forked by call_usermodehelper(),
+ * then control in this thread returns here, without waiting for the
+ * release agent task.  We don't bother to wait because the caller of
+ * this routine has no use for the exit status of the release agent
+ * task, so no sense holding our caller up for that.
+ *
+ */
+
+static void cgroup_release_agent(struct work_struct *work)
+{
+       BUG_ON(work != &release_agent_work);
+       mutex_lock(&cgroup_mutex);
+       spin_lock(&release_list_lock);
+       while (!list_empty(&release_list)) {
+               char *argv[3], *envp[3];
+               int i;
+               char *pathbuf;
+               struct cgroup *cont = list_entry(release_list.next,
+                                                   struct cgroup,
+                                                   release_list);
+               list_del_init(&cont->release_list);
+               spin_unlock(&release_list_lock);
+               pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL);
+               if (!pathbuf) {
+                       spin_lock(&release_list_lock);
+                       continue;
+               }
+
+               if (cgroup_path(cont, pathbuf, PAGE_SIZE) < 0) {
+                       kfree(pathbuf);
+                       spin_lock(&release_list_lock);
+                       continue;
+               }
+
+               i = 0;
+               argv[i++] = cont->root->release_agent_path;
+               argv[i++] = (char *)pathbuf;
+               argv[i] = NULL;
+
+               i = 0;
+               /* minimal command environment */
+               envp[i++] = "HOME=/";
+               envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
+               envp[i] = NULL;
+
+               /* Drop the lock while we invoke the usermode helper,
+                * since the exec could involve hitting disk and hence
+                * be a slow process */
+               mutex_unlock(&cgroup_mutex);
+               call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+               kfree(pathbuf);
+               mutex_lock(&cgroup_mutex);
+               spin_lock(&release_list_lock);
+       }
+       spin_unlock(&release_list_lock);
+       mutex_unlock(&cgroup_mutex);
+}