cgroup: Merge branch 'memcg_event' into for-3.14

author Tejun Heo <tj@kernel.org>

Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)

committer Tejun Heo <tj@kernel.org>

Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)
author Tejun Heo <tj@kernel.org>
Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)
committer Tejun Heo <tj@kernel.org>
Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)
diff --combined include/linux/cgroup.h

index 39c1d94,8d9fa89..492fa01
--- 1/include/linux/cgroup.h
--- 2/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@@ -29,7 -29,6 +29,6 @@@ struct cgroup_subsys
   struct inode;
   struct cgroup;
   struct css_id;
- struct eventfd_ctx;
   
   extern int cgroup_init_early(void);
   extern int cgroup_init(void);
@@@ -239,10 -238,6 +238,6 @@@ struct cgroup 
         struct rcu_head rcu_head;
         struct work_struct destroy_work;
   
-       /* List of events which userspace want to receive */
-       struct list_head event_list;
-       spinlock_t event_list_lock;
- 
         /* directory xattrs */
         struct simple_xattrs xattrs;
   };
@@@ -506,25 -501,6 +501,6 @@@ struct cftype 
         int (*trigger)(struct cgroup_subsys_state *css, unsigned int event);
   
         int (*release)(struct inode *inode, struct file *file);
- 
-       /*
-        * register_event() callback will be used to add new userspace
-        * waiter for changes related to the cftype. Implement it if
-        * you want to provide this functionality. Use eventfd_signal()
-        * on eventfd to send notification to userspace.
-        */
-       int (*register_event)(struct cgroup_subsys_state *css,
-                             struct cftype *cft, struct eventfd_ctx *eventfd,
-                             const char *args);
-       /*
-        * unregister_event() callback will be called when userspace
-        * closes the eventfd or on cgroup removing.
-        * This callback must be implemented, if you want provide
-        * notification functionality.
-        */
-       void (*unregister_event)(struct cgroup_subsys_state *css,
-                                struct cftype *cft,
-                                struct eventfd_ctx *eventfd);
   };
   
   /*
@@@ -612,6 -588,11 +588,6 @@@ struct cgroup_subsys 
         int subsys_id;
         int disabled;
         int early_init;
- -      /*
- -       * True if this subsys uses ID. ID is not available before cgroup_init()
- -       * (not available in early_init time.)
- -       */
- -      bool use_id;
   
         /*
          * If %false, this subsystem is properly hierarchical -
@@@ -637,6 -618,9 +613,6 @@@
          */
         struct cgroupfs_root *root;
         struct list_head sibling;
- -      /* used when use_id == true */
- -      struct idr idr;
- -      spinlock_t id_lock;
   
         /* list of cftype_sets */
         struct list_head cftsets;
@@@ -867,6 -851,35 +843,6 @@@ int css_scan_tasks(struct cgroup_subsys
   int cgroup_attach_task_all(struct task_struct *from, struct task_struct *);
   int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from);
   
- -/*
- - * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
- - * if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
- - * CSS ID is assigned at cgroup allocation (create) automatically
- - * and removed when subsys calls free_css_id() function. This is because
- - * the lifetime of cgroup_subsys_state is subsys's matter.
- - *
- - * Looking up and scanning function should be called under rcu_read_lock().
- - * Taking cgroup_mutex is not necessary for following calls.
- - * But the css returned by this routine can be "not populated yet" or "being
- - * destroyed". The caller should check css and cgroup's status.
- - */
- -
- -/*
- - * Typically Called at ->destroy(), or somewhere the subsys frees
- - * cgroup_subsys_state.
- - */
- -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
- -
- -/* Find a cgroup_subsys_state which has given ID */
- -
- -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id);
- -
- -/* Returns true if root is ancestor of cg */
- -bool css_is_ancestor(struct cgroup_subsys_state *cg,
- -                   const struct cgroup_subsys_state *root);
- -
- -/* Get id and depth of css */
- -unsigned short css_id(struct cgroup_subsys_state *css);
   struct cgroup_subsys_state *css_from_dir(struct dentry *dentry,
                                          struct cgroup_subsys *ss);
   
diff --combined init/Kconfig

index 79383d3,3ca5b81..93f3443
--- 1/init/Kconfig
--- 2/init/Kconfig
+++ b/init/Kconfig
@@@ -284,7 -284,7 +284,7 @@@ config AUDI
   
   config AUDITSYSCALL
         bool "Enable system-call auditing support"
- -      depends on AUDIT && (X86 || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT))
+ +      depends on AUDIT && (X86 || PARISC || PPC || S390 || IA64 || UML || SPARC64 || SUPERH || (ARM && AEABI && !OABI_COMPAT))
         default y if SECURITY_SELINUX
         help
           Enable low-overhead system-call auditing infrastructure that
@@@ -301,6 -301,20 +301,6 @@@ config AUDIT_TRE
         depends on AUDITSYSCALL
         select FSNOTIFY
   
- -config AUDIT_LOGINUID_IMMUTABLE
- -      bool "Make audit loginuid immutable"
- -      depends on AUDIT
- -      help
- -        The config option toggles if a task setting its loginuid requires
- -        CAP_SYS_AUDITCONTROL or if that task should require no special permissions
- -        but should instead only allow setting its loginuid if it was never
- -        previously set.  On systems which use systemd or a similar central
- -        process to restart login services this should be set to true.  On older
- -        systems in which an admin would typically have to directly stop and
- -        start processes this should be set to false.  Setting this to true allows
- -        one to drop potentially dangerous capabilites from the login tasks,
- -        but may not be backwards compatible with older init systems.
- -
   source "kernel/irq/Kconfig"
   source "kernel/time/Kconfig"
   
@@@ -340,8 -354,7 +340,8 @@@ config VIRT_CPU_ACCOUNTING_NATIV
   
   config VIRT_CPU_ACCOUNTING_GEN
         bool "Full dynticks CPU time accounting"
- -      depends on HAVE_CONTEXT_TRACKING && 64BIT
+ +      depends on HAVE_CONTEXT_TRACKING
+ +      depends on HAVE_VIRT_CPU_ACCOUNTING_GEN
         select VIRT_CPU_ACCOUNTING
         select CONTEXT_TRACKING
         help
@@@ -831,7 -844,7 +831,7 @@@ config NUMA_BALANCING_DEFAULT_ENABLE
         default y
         depends on NUMA_BALANCING
         help
- -        If set, autonumic NUMA balancing will be enabled if running on a NUMA
+ +        If set, automatic NUMA balancing will be enabled if running on a NUMA
           machine.
   
   config NUMA_BALANCING
@@@ -842,13 -855,12 +842,12 @@@
         help
           This option adds support for automatic NUMA aware memory/task placement.
           The mechanism is quite primitive and is based on migrating memory when
- -        it is references to the node the task is running on.
+ +        it has references to the node the task is running on.
   
           This system will be inactive on UMA systems.
   
   menuconfig CGROUPS
         boolean "Control Group support"
-       depends on EVENTFD
         help
           This option adds support for grouping sets of processes together, for
           use with process control subsystems such as Cpusets, CFS, memory
@@@ -915,6 -927,7 +914,7 @@@ config MEMC
         bool "Memory Resource Controller for Control Groups"
         depends on RESOURCE_COUNTERS
         select MM_OWNER
+       select EVENTFD
         help
           Provides a memory resource controller that manages both anonymous
           memory and page cache. (See Documentation/cgroups/memory.txt)
@@@ -1154,7 -1167,6 +1154,6 @@@ config UIDGID_STRICT_TYPE_CHECK
   
   config SCHED_AUTOGROUP
         bool "Automatic process group scheduling"
-       select EVENTFD
         select CGROUPS
         select CGROUP_SCHED
         select FAIR_GROUP_SCHED
@@@ -1655,18 -1667,6 +1654,18 @@@ config BASE_SMAL
         default 0 if BASE_FULL
         default 1 if !BASE_FULL
   
+ +config SYSTEM_TRUSTED_KEYRING
+ +      bool "Provide system-wide ring of trusted keys"
+ +      depends on KEYS
+ +      help
+ +        Provide a system keyring to which trusted keys can be added.  Keys in
+ +        the keyring are considered to be trusted.  Keys may be added at will
+ +        by the kernel from compiled-in data and from hardware key stores, but
+ +        userspace may only add extra keys if those keys can be verified by
+ +        keys already in the keyring.
+ +
+ +        Keys in this keyring are used by module signature checking.
+ +
   menuconfig MODULES
         bool "Enable loadable module support"
         option modules
@@@ -1740,7 -1740,6 +1739,7 @@@ config MODULE_SRCVERSION_AL
   config MODULE_SIG
         bool "Module signature verification"
         depends on MODULES
+ +      select SYSTEM_TRUSTED_KEYRING
         select KEYS
         select CRYPTO
         select ASYMMETRIC_KEY_TYPE
diff --combined kernel/cgroup.c

index a7b98ee,c0248e1..be42967
--- 1/kernel/cgroup.c
--- 2/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@@ -56,11 -56,8 +56,8 @@@
   #include <linux/pid_namespace.h>
   #include <linux/idr.h>
   #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
- #include <linux/eventfd.h>
- #include <linux/poll.h>
   #include <linux/flex_array.h> /* used in cgroup_attach_task */
   #include <linux/kthread.h>
- #include <linux/file.h>
   
   #include <linux/atomic.h>
   
@@@ -90,14 -87,6 +87,14 @@@ static DEFINE_MUTEX(cgroup_mutex)
   static DEFINE_MUTEX(cgroup_root_mutex);
   
   /*
+ + * cgroup destruction makes heavy use of work items and there can be a lot
+ + * of concurrent destructions.  Use a separate workqueue so that cgroup
+ + * destruction work items don't end up filling up max_active of system_wq
+ + * which may lead to deadlock.
+ + */
+ +static struct workqueue_struct *cgroup_destroy_wq;
+ +
+ +/*
    * Generate an array of cgroup subsystem pointers. At boot time, this is
    * populated with the built in subsystems, and modular subsystems are
    * registered after that. The mutable section of this array is protected by
@@@ -132,36 -121,38 +129,6 @@@ struct cfent 
         struct simple_xattrs            xattrs;
   };
   
--/*
-  * cgroup_event represents events which userspace want to receive.
- - * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
- - * cgroup_subsys->use_id != 0.
-- */
- struct cgroup_event {
- -#define CSS_ID_MAX    (65535)
- -struct css_id {
--      /*
-        * css which the event belongs to.
- -       * The css to which this ID points. This pointer is set to valid value
- -       * after cgroup is populated. If cgroup is removed, this will be NULL.
- -       * This pointer is expected to be RCU-safe because destroy()
- -       * is called after synchronize_rcu(). But for safe use, css_tryget()
- -       * should be used for avoiding race.
--       */
-       struct cgroup_subsys_state *css;
- -      struct cgroup_subsys_state __rcu *css;
--      /*
-        * Control file which the event associated.
- -       * ID of this css.
--       */
-       struct cftype *cft;
- -      unsigned short id;
--      /*
-        * eventfd to signal userspace about the event.
- -       * Depth in hierarchy which this ID belongs to.
--       */
-       struct eventfd_ctx *eventfd;
- -      unsigned short depth;
--      /*
-        * Each of these stored in a list by the cgroup.
- -       * ID is freed by RCU. (and lookup routine is RCU safe.)
--       */
-       struct list_head list;
- -      struct rcu_head rcu_head;
--      /*
-        * All fields below needed to unregister event when
-        * userspace closes eventfd.
- -       * Hierarchy of CSS ID belongs to.
--       */
-       poll_table pt;
-       wait_queue_head_t *wqh;
-       wait_queue_t wait;
-       struct work_struct remove;
- -      unsigned short stack[0]; /* Array of Length (depth+1) */
--};
--
   /* The list of hierarchy roots */
   
   static LIST_HEAD(cgroup_roots);
@@@ -363,6 -354,9 +330,6 @@@ struct cgrp_cset_link 
   static struct css_set init_css_set;
   static struct cgrp_cset_link init_cgrp_cset_link;
   
- -static int cgroup_init_idr(struct cgroup_subsys *ss,
- -                         struct cgroup_subsys_state *css);
- -
   /*
    * css_set_lock protects the list of css_set objects, and the chain of
    * tasks off each css_set.  Nests outside task->alloc_lock due to
@@@ -814,6 -808,8 +781,6 @@@ static struct backing_dev_info cgroup_b
         .capabilities   = BDI_CAP_NO_ACCT_AND_WRITEBACK,
   };
   
- -static int alloc_css_id(struct cgroup_subsys_state *child_css);
- -
   static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb)
   {
         struct inode *inode = new_inode(sb);
@@@ -879,7 -875,7 +846,7 @@@ static void cgroup_free_rcu(struct rcu_
         struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
   
         INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
- -      schedule_work(&cgrp->destroy_work);
+ +      queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
   }
   
   static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@@ -903,6 -899,11 +870,6 @@@
         iput(inode);
   }
   
- -static int cgroup_delete(const struct dentry *d)
- -{
- -      return 1;
- -}
- -
   static void remove_dir(struct dentry *d)
   {
         struct dentry *parent = dget(d->d_parent);
@@@ -1351,8 -1352,6 +1318,6 @@@ static void init_cgroup_housekeeping(st
         INIT_LIST_HEAD(&cgrp->pidlists);
         mutex_init(&cgrp->pidlist_mutex);
         cgrp->dummy_css.cgroup = cgrp;
-       INIT_LIST_HEAD(&cgrp->event_list);
-       spin_lock_init(&cgrp->event_list_lock);
         simple_xattrs_init(&cgrp->xattrs);
   }
   
@@@ -1489,7 -1488,7 +1454,7 @@@ static int cgroup_get_rootdir(struct su
   {
         static const struct dentry_operations cgroup_dops = {
                 .d_iput = cgroup_diput,
- -              .d_delete = cgroup_delete,
+ +              .d_delete = always_delete_dentry,
         };
   
         struct inode *inode =
@@@ -2626,16 -2625,6 +2591,6 @@@ static const struct inode_operations cg
         .removexattr = cgroup_removexattr,
   };
   
- /*
-  * Check if a file is a control file
-  */
- static inline struct cftype *__file_cft(struct file *file)
- {
-       if (file_inode(file)->i_fop != &cgroup_file_operations)
-               return ERR_PTR(-EINVAL);
-       return __d_cft(file->f_dentry);
- }
- 
   static int cgroup_create_file(struct dentry *dentry, umode_t mode,
                                 struct super_block *sb)
   {
@@@ -3915,202 -3904,6 +3870,6 @@@ static void cgroup_dput(struct cgroup *
         deactivate_super(sb);
   }
   
- /*
-  * Unregister event and free resources.
-  *
-  * Gets called from workqueue.
-  */
- static void cgroup_event_remove(struct work_struct *work)
- {
-       struct cgroup_event *event = container_of(work, struct cgroup_event,
-                       remove);
-       struct cgroup_subsys_state *css = event->css;
- 
-       remove_wait_queue(event->wqh, &event->wait);
- 
-       event->cft->unregister_event(css, event->cft, event->eventfd);
- 
-       /* Notify userspace the event is going away. */
-       eventfd_signal(event->eventfd, 1);
- 
-       eventfd_ctx_put(event->eventfd);
-       kfree(event);
-       css_put(css);
- }
- 
- /*
-  * Gets called on POLLHUP on eventfd when user closes it.
-  *
-  * Called with wqh->lock held and interrupts disabled.
-  */
- static int cgroup_event_wake(wait_queue_t *wait, unsigned mode,
-               int sync, void *key)
- {
-       struct cgroup_event *event = container_of(wait,
-                       struct cgroup_event, wait);
-       struct cgroup *cgrp = event->css->cgroup;
-       unsigned long flags = (unsigned long)key;
- 
-       if (flags & POLLHUP) {
-               /*
-                * If the event has been detached at cgroup removal, we
-                * can simply return knowing the other side will cleanup
-                * for us.
-                *
-                * We can't race against event freeing since the other
-                * side will require wqh->lock via remove_wait_queue(),
-                * which we hold.
-                */
-               spin_lock(&cgrp->event_list_lock);
-               if (!list_empty(&event->list)) {
-                       list_del_init(&event->list);
-                       /*
-                        * We are in atomic context, but cgroup_event_remove()
-                        * may sleep, so we have to call it in workqueue.
-                        */
-                       schedule_work(&event->remove);
-               }
-               spin_unlock(&cgrp->event_list_lock);
-       }
- 
-       return 0;
- }
- 
- static void cgroup_event_ptable_queue_proc(struct file *file,
-               wait_queue_head_t *wqh, poll_table *pt)
- {
-       struct cgroup_event *event = container_of(pt,
-                       struct cgroup_event, pt);
- 
-       event->wqh = wqh;
-       add_wait_queue(wqh, &event->wait);
- }
- 
- /*
-  * Parse input and register new cgroup event handler.
-  *
-  * Input must be in format '<event_fd> <control_fd> <args>'.
-  * Interpretation of args is defined by control file implementation.
-  */
- static int cgroup_write_event_control(struct cgroup_subsys_state *dummy_css,
-                                     struct cftype *cft, const char *buffer)
- {
-       struct cgroup *cgrp = dummy_css->cgroup;
-       struct cgroup_event *event;
-       struct cgroup_subsys_state *cfile_css;
-       unsigned int efd, cfd;
-       struct fd efile;
-       struct fd cfile;
-       char *endp;
-       int ret;
- 
-       efd = simple_strtoul(buffer, &endp, 10);
-       if (*endp != ' ')
-               return -EINVAL;
-       buffer = endp + 1;
- 
-       cfd = simple_strtoul(buffer, &endp, 10);
-       if ((*endp != ' ') && (*endp != '\0'))
-               return -EINVAL;
-       buffer = endp + 1;
- 
-       event = kzalloc(sizeof(*event), GFP_KERNEL);
-       if (!event)
-               return -ENOMEM;
- 
-       INIT_LIST_HEAD(&event->list);
-       init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc);
-       init_waitqueue_func_entry(&event->wait, cgroup_event_wake);
-       INIT_WORK(&event->remove, cgroup_event_remove);
- 
-       efile = fdget(efd);
-       if (!efile.file) {
-               ret = -EBADF;
-               goto out_kfree;
-       }
- 
-       event->eventfd = eventfd_ctx_fileget(efile.file);
-       if (IS_ERR(event->eventfd)) {
-               ret = PTR_ERR(event->eventfd);
-               goto out_put_efile;
-       }
- 
-       cfile = fdget(cfd);
-       if (!cfile.file) {
-               ret = -EBADF;
-               goto out_put_eventfd;
-       }
- 
-       /* the process need read permission on control file */
-       /* AV: shouldn't we check that it's been opened for read instead? */
-       ret = inode_permission(file_inode(cfile.file), MAY_READ);
-       if (ret < 0)
-               goto out_put_cfile;
- 
-       event->cft = __file_cft(cfile.file);
-       if (IS_ERR(event->cft)) {
-               ret = PTR_ERR(event->cft);
-               goto out_put_cfile;
-       }
- 
-       if (!event->cft->ss) {
-               ret = -EBADF;
-               goto out_put_cfile;
-       }
- 
-       /*
-        * Determine the css of @cfile, verify it belongs to the same
-        * cgroup as cgroup.event_control, and associate @event with it.
-        * Remaining events are automatically removed on cgroup destruction
-        * but the removal is asynchronous, so take an extra ref.
-        */
-       rcu_read_lock();
- 
-       ret = -EINVAL;
-       event->css = cgroup_css(cgrp, event->cft->ss);
-       cfile_css = css_from_dir(cfile.file->f_dentry->d_parent, event->cft->ss);
-       if (event->css && event->css == cfile_css && css_tryget(event->css))
-               ret = 0;
- 
-       rcu_read_unlock();
-       if (ret)
-               goto out_put_cfile;
- 
-       if (!event->cft->register_event || !event->cft->unregister_event) {
-               ret = -EINVAL;
-               goto out_put_css;
-       }
- 
-       ret = event->cft->register_event(event->css, event->cft,
-                       event->eventfd, buffer);
-       if (ret)
-               goto out_put_css;
- 
-       efile.file->f_op->poll(efile.file, &event->pt);
- 
-       spin_lock(&cgrp->event_list_lock);
-       list_add(&event->list, &cgrp->event_list);
-       spin_unlock(&cgrp->event_list_lock);
- 
-       fdput(cfile);
-       fdput(efile);
- 
-       return 0;
- 
- out_put_css:
-       css_put(event->css);
- out_put_cfile:
-       fdput(cfile);
- out_put_eventfd:
-       eventfd_ctx_put(event->eventfd);
- out_put_efile:
-       fdput(efile);
- out_kfree:
-       kfree(event);
- 
-       return ret;
- }
- 
   static u64 cgroup_clone_children_read(struct cgroup_subsys_state *css,
                                       struct cftype *cft)
   {
@@@ -4136,11 -3929,6 +3895,6 @@@ static struct cftype cgroup_base_files[
                 .mode = S_IRUGO | S_IWUSR,
         },
         {
-               .name = "cgroup.event_control",
-               .write_string = cgroup_write_event_control,
-               .mode = S_IWUGO,
-       },
-       {
                 .name = "cgroup.clone_children",
                 .flags = CFTYPE_INSANE,
                 .read_u64 = cgroup_clone_children_read,
@@@ -4206,6 -3994,21 +3960,6 @@@ static int cgroup_populate_dir(struct c
                                 goto err;
                 }
         }
- -
- -      /* This cgroup is ready now */
- -      for_each_root_subsys(cgrp->root, ss) {
- -              struct cgroup_subsys_state *css = cgroup_css(cgrp, ss);
- -              struct css_id *id = rcu_dereference_protected(css->id, true);
- -
- -              /*
- -               * Update id->css pointer and make this css visible from
- -               * CSS ID functions. This pointer will be dereferened
- -               * from RCU-read-side without locks.
- -               */
- -              if (id)
- -                      rcu_assign_pointer(id->css, css);
- -      }
- -
         return 0;
   err:
         cgroup_clear_dir(cgrp, subsys_mask);
@@@ -4257,7 -4060,7 +4011,7 @@@ static void css_free_rcu_fn(struct rcu_
          * css_put().  dput() requires process context which we don't have.
          */
         INIT_WORK(&css->destroy_work, css_free_work_fn);
- -      schedule_work(&css->destroy_work);
+ +      queue_work(cgroup_destroy_wq, &css->destroy_work);
   }
   
   static void css_release(struct percpu_ref *ref)
@@@ -4274,6 -4077,7 +4028,6 @@@ static void init_css(struct cgroup_subs
         css->cgroup = cgrp;
         css->ss = ss;
         css->flags = 0;
- -      css->id = NULL;
   
         if (cgrp->parent)
                 css->parent = cgroup_css(cgrp->parent, ss);
@@@ -4405,6 -4209,12 +4159,6 @@@ static long cgroup_create(struct cgrou
                         goto err_free_all;
   
                 init_css(css, ss, cgrp);
- -
- -              if (ss->use_id) {
- -                      err = alloc_css_id(css);
- -                      if (err)
- -                              goto err_free_all;
- -              }
         }
   
         /*
@@@ -4547,7 -4357,7 +4301,7 @@@ static void css_killed_ref_fn(struct pe
                 container_of(ref, struct cgroup_subsys_state, refcnt);
   
         INIT_WORK(&css->destroy_work, css_killed_work_fn);
- -      schedule_work(&css->destroy_work);
+ +      queue_work(cgroup_destroy_wq, &css->destroy_work);
   }
   
   /**
@@@ -4610,7 -4420,6 +4364,6 @@@ static int cgroup_destroy_locked(struc
         __releases(&cgroup_mutex) __acquires(&cgroup_mutex)
   {
         struct dentry *d = cgrp->dentry;
-       struct cgroup_event *event, *tmp;
         struct cgroup_subsys *ss;
         struct cgroup *child;
         bool empty;
@@@ -4685,18 -4494,6 +4438,6 @@@
         dget(d);
         cgroup_d_remove_dir(d);
   
-       /*
-        * Unregister events and notify userspace.
-        * Notify userspace about cgroup removing only after rmdir of cgroup
-        * directory to avoid race between userspace and kernelspace.
-        */
-       spin_lock(&cgrp->event_list_lock);
-       list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) {
-               list_del_init(&event->list);
-               schedule_work(&event->remove);
-       }
-       spin_unlock(&cgrp->event_list_lock);
- 
         return 0;
   };
   
@@@ -4869,6 -4666,12 +4610,6 @@@ int __init_or_module cgroup_load_subsys
   
         /* our new subsystem will be attached to the dummy hierarchy. */
         init_css(css, ss, cgroup_dummy_top);
- -      /* init_idr must be after init_css() because it sets css->id. */
- -      if (ss->use_id) {
- -              ret = cgroup_init_idr(ss, css);
- -              if (ret)
- -                      goto err_unload;
- -      }
   
         /*
          * Now we need to entangle the css into the existing css_sets. unlike
@@@ -4934,6 -4737,9 +4675,6 @@@ void cgroup_unload_subsys(struct cgroup
   
         offline_css(cgroup_css(cgroup_dummy_top, ss));
   
- -      if (ss->use_id)
- -              idr_destroy(&ss->idr);
- -
         /* deassign the subsys_id */
         cgroup_subsys[ss->subsys_id] = NULL;
   
@@@ -4960,7 -4766,8 +4701,7 @@@
         /*
          * remove subsystem's css from the cgroup_dummy_top and free it -
          * need to free before marking as null because ss->css_free needs
- -       * the cgrp->subsys pointer to find their state. note that this
- -       * also takes care of freeing the css_id.
+ +       * the cgrp->subsys pointer to find their state.
          */
         ss->css_free(cgroup_css(cgroup_dummy_top, ss));
         RCU_INIT_POINTER(cgroup_dummy_top->subsys[ss->subsys_id], NULL);
@@@ -5031,6 -4838,8 +4772,6 @@@ int __init cgroup_init(void
         for_each_builtin_subsys(ss, i) {
                 if (!ss->early_init)
                         cgroup_init_subsys(ss);
- -              if (ss->use_id)
- -                      cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]);
         }
   
         /* allocate id for the dummy hierarchy */
@@@ -5071,22 -4880,6 +4812,22 @@@ out
         return err;
   }
   
+ +static int __init cgroup_wq_init(void)
+ +{
+ +      /*
+ +       * There isn't much point in executing the destruction path in
+ +       * parallel.  A good chunk is serialized with cgroup_mutex anyway.
+ +       * Use 1 for @max_active.
+ +       *
+ +       * We would prefer to do this in cgroup_init() above, but that
+ +       * is called before init_workqueues(): so leave this until after.
+ +       */
+ +      cgroup_destroy_wq = alloc_workqueue("cgroup_destroy", 0, 1);
+ +      BUG_ON(!cgroup_destroy_wq);
+ +      return 0;
+ +}
+ +core_initcall(cgroup_wq_init);
+ +
   /*
    * proc_cgroup_show()
    *  - Print task's cgroup paths into seq_file, one line for each hierarchy
@@@ -5466,6 -5259,181 +5207,6 @@@ static int __init cgroup_disable(char *
   }
   __setup("cgroup_disable=", cgroup_disable);
   
- -/*
- - * Functons for CSS ID.
- - */
- -
- -/* to get ID other than 0, this should be called when !cgroup_is_dead() */
- -unsigned short css_id(struct cgroup_subsys_state *css)
- -{
- -      struct css_id *cssid;
- -
- -      /*
- -       * This css_id() can return correct value when somone has refcnt
- -       * on this or this is under rcu_read_lock(). Once css->id is allocated,
- -       * it's unchanged until freed.
- -       */
- -      cssid = rcu_dereference_raw(css->id);
- -
- -      if (cssid)
- -              return cssid->id;
- -      return 0;
- -}
- -EXPORT_SYMBOL_GPL(css_id);
- -
- -/**
- - *  css_is_ancestor - test "root" css is an ancestor of "child"
- - * @child: the css to be tested.
- - * @root: the css supporsed to be an ancestor of the child.
- - *
- - * Returns true if "root" is an ancestor of "child" in its hierarchy. Because
- - * this function reads css->id, the caller must hold rcu_read_lock().
- - * But, considering usual usage, the csses should be valid objects after test.
- - * Assuming that the caller will do some action to the child if this returns
- - * returns true, the caller must take "child";s reference count.
- - * If "child" is valid object and this returns true, "root" is valid, too.
- - */
- -
- -bool css_is_ancestor(struct cgroup_subsys_state *child,
- -                  const struct cgroup_subsys_state *root)
- -{
- -      struct css_id *child_id;
- -      struct css_id *root_id;
- -
- -      child_id  = rcu_dereference(child->id);
- -      if (!child_id)
- -              return false;
- -      root_id = rcu_dereference(root->id);
- -      if (!root_id)
- -              return false;
- -      if (child_id->depth < root_id->depth)
- -              return false;
- -      if (child_id->stack[root_id->depth] != root_id->id)
- -              return false;
- -      return true;
- -}
- -
- -void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
- -{
- -      struct css_id *id = rcu_dereference_protected(css->id, true);
- -
- -      /* When this is called before css_id initialization, id can be NULL */
- -      if (!id)
- -              return;
- -
- -      BUG_ON(!ss->use_id);
- -
- -      rcu_assign_pointer(id->css, NULL);
- -      rcu_assign_pointer(css->id, NULL);
- -      spin_lock(&ss->id_lock);
- -      idr_remove(&ss->idr, id->id);
- -      spin_unlock(&ss->id_lock);
- -      kfree_rcu(id, rcu_head);
- -}
- -EXPORT_SYMBOL_GPL(free_css_id);
- -
- -/*
- - * This is called by init or create(). Then, calls to this function are
- - * always serialized (By cgroup_mutex() at create()).
- - */
- -
- -static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
- -{
- -      struct css_id *newid;
- -      int ret, size;
- -
- -      BUG_ON(!ss->use_id);
- -
- -      size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
- -      newid = kzalloc(size, GFP_KERNEL);
- -      if (!newid)
- -              return ERR_PTR(-ENOMEM);
- -
- -      idr_preload(GFP_KERNEL);
- -      spin_lock(&ss->id_lock);
- -      /* Don't use 0. allocates an ID of 1-65535 */
- -      ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT);
- -      spin_unlock(&ss->id_lock);
- -      idr_preload_end();
- -
- -      /* Returns error when there are no free spaces for new ID.*/
- -      if (ret < 0)
- -              goto err_out;
- -
- -      newid->id = ret;
- -      newid->depth = depth;
- -      return newid;
- -err_out:
- -      kfree(newid);
- -      return ERR_PTR(ret);
- -
- -}
- -
- -static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss,
- -                                          struct cgroup_subsys_state *rootcss)
- -{
- -      struct css_id *newid;
- -
- -      spin_lock_init(&ss->id_lock);
- -      idr_init(&ss->idr);
- -
- -      newid = get_new_cssid(ss, 0);
- -      if (IS_ERR(newid))
- -              return PTR_ERR(newid);
- -
- -      newid->stack[0] = newid->id;
- -      RCU_INIT_POINTER(newid->css, rootcss);
- -      RCU_INIT_POINTER(rootcss->id, newid);
- -      return 0;
- -}
- -
- -static int alloc_css_id(struct cgroup_subsys_state *child_css)
- -{
- -      struct cgroup_subsys_state *parent_css = css_parent(child_css);
- -      struct css_id *child_id, *parent_id;
- -      int i, depth;
- -
- -      parent_id = rcu_dereference_protected(parent_css->id, true);
- -      depth = parent_id->depth + 1;
- -
- -      child_id = get_new_cssid(child_css->ss, depth);
- -      if (IS_ERR(child_id))
- -              return PTR_ERR(child_id);
- -
- -      for (i = 0; i < depth; i++)
- -              child_id->stack[i] = parent_id->stack[i];
- -      child_id->stack[depth] = child_id->id;
- -      /*
- -       * child_id->css pointer will be set after this cgroup is available
- -       * see cgroup_populate_dir()
- -       */
- -      rcu_assign_pointer(child_css->id, child_id);
- -
- -      return 0;
- -}
- -
- -/**
- - * css_lookup - lookup css by id
- - * @ss: cgroup subsys to be looked into.
- - * @id: the id
- - *
- - * Returns pointer to cgroup_subsys_state if there is valid one with id.
- - * NULL if not. Should be called under rcu_read_lock()
- - */
- -struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
- -{
- -      struct css_id *cssid = NULL;
- -
- -      BUG_ON(!ss->use_id);
- -      cssid = idr_find(&ss->idr, id);
- -
- -      if (unlikely(!cssid))
- -              return NULL;
- -
- -      return rcu_dereference(cssid->css);
- -}
- -EXPORT_SYMBOL_GPL(css_lookup);
- -
   /**
    * css_from_dir - get corresponding css from the dentry of a cgroup dir
    * @dentry: directory dentry of interest
diff --combined mm/memcontrol.c

index f1a0ae6,ec8582b..7aa0d40
--- 1/mm/memcontrol.c
--- 2/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@@ -45,6 -45,7 +45,7 @@@
   #include <linux/swapops.h>
   #include <linux/spinlock.h>
   #include <linux/eventfd.h>
+ #include <linux/poll.h>
   #include <linux/sort.h>
   #include <linux/fs.h>
   #include <linux/seq_file.h>
@@@ -55,11 -56,11 +56,12 @@@
   #include <linux/cpu.h>
   #include <linux/oom.h>
   #include <linux/lockdep.h>
+ #include <linux/file.h>
   #include "internal.h"
   #include <net/sock.h>
   #include <net/ip.h>
   #include <net/tcp_memcontrol.h>
+ +#include "slab.h"
   
   #include <asm/uaccess.h>
   
@@@ -227,6 -228,46 +229,46 @@@ struct mem_cgroup_eventfd_list 
         struct eventfd_ctx *eventfd;
   };
   
+ /*
+  * mem_cgroup_event represents an event which userspace wants to receive.
+  */
+ struct mem_cgroup_event {
+       /*
+        * memcg which the event belongs to.
+        */
+       struct mem_cgroup *memcg;
+       /*
+        * eventfd to signal userspace about the event.
+        */
+       struct eventfd_ctx *eventfd;
+       /*
+        * Each of these is stored in a list by the memcg.
+        */
+       struct list_head list;
+       /*
+        * register_event() callback will be used to add new userspace
+        * waiter for changes related to this event.  Use eventfd_signal()
+        * on eventfd to send notification to userspace.
+        */
+       int (*register_event)(struct mem_cgroup *memcg,
+                             struct eventfd_ctx *eventfd, const char *args);
+       /*
+        * unregister_event() callback will be called when userspace closes
+        * the eventfd or on cgroup removing.  This callback must be set,
+        * if you want to provide notification functionality.
+        */
+       void (*unregister_event)(struct mem_cgroup *memcg,
+                                struct eventfd_ctx *eventfd);
+       /*
+        * All fields below are needed to unregister the event when
+        * userspace closes the eventfd.
+        */
+       poll_table pt;
+       wait_queue_head_t *wqh;
+       wait_queue_t wait;
+       struct work_struct remove;
+ };
+ 
   static void mem_cgroup_threshold(struct mem_cgroup *memcg);
   static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
   
@@@ -313,7 -354,7 +355,7 @@@ struct mem_cgroup 
   
         atomic_t        dead_count;
   #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- -      struct tcp_memcontrol tcp_mem;
+ +      struct cg_proto tcp_mem;
   #endif
   #if defined(CONFIG_MEMCG_KMEM)
         /* analogous to slab_common's slab_caches list. per-memcg */
@@@ -331,6 -372,10 +373,10 @@@
         atomic_t        numainfo_updating;
   #endif
   
+       /* List of events which userspace want to receive */
+       struct list_head event_list;
+       spinlock_t event_list_lock;
+ 
         struct mem_cgroup_per_node *nodeinfo[0];
         /* WARNING: nodeinfo must be the last member here */
   };
@@@ -490,39 -535,11 +536,34 @@@ struct cgroup_subsys_state *vmpressure_
         return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
   }
   
- struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css)
- {
-       return &mem_cgroup_from_css(css)->vmpressure;
- }
- 
   static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
   {
         return (memcg == root_mem_cgroup);
   }
   
+ +/*
+ + * We restrict the id in the range of [1, 65535], so it can fit into
+ + * an unsigned short.
+ + */
+ +#define MEM_CGROUP_ID_MAX     USHRT_MAX
+ +
+ +static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
+ +{
+ +      /*
+ +       * The ID of the root cgroup is 0, but memcg treat 0 as an
+ +       * invalid ID, so we return (cgroup_id + 1).
+ +       */
+ +      return memcg->css.cgroup->id + 1;
+ +}
+ +
+ +static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
+ +{
+ +      struct cgroup_subsys_state *css;
+ +
+ +      css = css_from_id(id - 1, &mem_cgroup_subsys);
+ +      return mem_cgroup_from_css(css);
+ +}
+ +
   /* Writing them here to avoid exposing memcg's inner layout */
   #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
   
@@@ -575,13 -592,13 +616,13 @@@ struct cg_proto *tcp_proto_cgroup(struc
         if (!memcg || mem_cgroup_is_root(memcg))
                 return NULL;
   
- -      return &memcg->tcp_mem.cg_proto;
+ +      return &memcg->tcp_mem;
   }
   EXPORT_SYMBOL(tcp_proto_cgroup);
   
   static void disarm_sock_keys(struct mem_cgroup *memcg)
   {
- -      if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+ +      if (!memcg_proto_activated(&memcg->tcp_mem))
                 return;
         static_key_slow_dec(&memcg_socket_limit_enabled);
   }
@@@ -594,11 -611,16 +635,11 @@@ static void disarm_sock_keys(struct mem
   #ifdef CONFIG_MEMCG_KMEM
   /*
    * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
- - * There are two main reasons for not using the css_id for this:
- - *  1) this works better in sparse environments, where we have a lot of memcgs,
- - *     but only a few kmem-limited. Or also, if we have, for instance, 200
- - *     memcgs, and none but the 200th is kmem-limited, we'd have to have a
- - *     200 entry array for that.
- - *
- - *  2) In order not to violate the cgroup API, we would like to do all memory
- - *     allocation in ->create(). At that point, we haven't yet allocated the
- - *     css_id. Having a separate index prevents us from messing with the cgroup
- - *     core for this
+ + * The main reason for not using cgroup id for this:
+ + *  this works better in sparse environments, where we have a lot of memcgs,
+ + *  but only a few kmem-limited. Or also, if we have, for instance, 200
+ + *  memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ + *  200 entry array for that.
    *
    * The current size of the caches array is stored in
    * memcg_limited_groups_array_size.  It will double each time we have to
@@@ -613,14 -635,14 +654,14 @@@ int memcg_limited_groups_array_size
    * cgroups is a reasonable guess. In the future, it could be a parameter or
    * tunable, but that is strictly not necessary.
    *
- - * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
+ + * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
    * this constant directly from cgroup, but it is understandable that this is
    * better kept as an internal representation in cgroup.c. In any case, the
- - * css_id space is not getting any smaller, and we don't have to necessarily
+ + * cgrp_id space is not getting any smaller, and we don't have to necessarily
    * increase ours as well if it increases.
    */
   #define MEMCG_CACHES_MIN_SIZE 4
- -#define MEMCG_CACHES_MAX_SIZE 65535
+ +#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
   
   /*
    * A lot of the calls to the cache allocation functions are expected to be
@@@ -1427,7 -1449,7 +1468,7 @@@ bool __mem_cgroup_same_or_subtree(cons
                 return true;
         if (!root_memcg->use_hierarchy || !memcg)
                 return false;
- -      return css_is_ancestor(&memcg->css, &root_memcg->css);
+ +      return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
   }
   
   static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
@@@ -2845,10 -2867,15 +2886,10 @@@ static void __mem_cgroup_cancel_local_c
    */
   static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
   {
- -      struct cgroup_subsys_state *css;
- -
         /* ID 0 is unused ID */
         if (!id)
                 return NULL;
- -      css = css_lookup(&mem_cgroup_subsys, id);
- -      if (!css)
- -              return NULL;
- -      return mem_cgroup_from_css(css);
+ +      return mem_cgroup_from_id(id);
   }
   
   struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@@ -2969,7 -2996,7 +3010,7 @@@ static struct kmem_cache *memcg_params_
   
         VM_BUG_ON(p->is_root_cache);
         cachep = p->root_cache;
- -      return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
+ +      return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
   }
   
   #ifdef CONFIG_SLABINFO
@@@ -2998,14 -3025,21 +3039,14 @@@ static int memcg_charge_kmem(struct mem
         struct res_counter *fail_res;
         struct mem_cgroup *_memcg;
         int ret = 0;
- -      bool may_oom;
   
         ret = res_counter_charge(&memcg->kmem, size, &fail_res);
         if (ret)
                 return ret;
   
- -      /*
- -       * Conditions under which we can wait for the oom_killer. Those are
- -       * the same conditions tested by the core page allocator
- -       */
- -      may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
- -
         _memcg = memcg;
         ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- -                                    &_memcg, may_oom);
+ +                                    &_memcg, oom_gfp_allowed(gfp));
   
         if (ret == -EINTR)  {
                 /*
@@@ -3145,7 -3179,7 +3186,7 @@@ int memcg_update_cache_size(struct kmem
   {
         struct memcg_cache_params *cur_params = s->memcg_params;
   
- -      VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
+ +      VM_BUG_ON(!is_root_cache(s));
   
         if (num_groups > memcg_limited_groups_array_size) {
                 int i;
@@@ -3406,7 -3440,7 +3447,7 @@@ static struct kmem_cache *memcg_create_
         idx = memcg_cache_id(memcg);
   
         mutex_lock(&memcg_cache_mutex);
- -      new_cachep = cachep->memcg_params->memcg_caches[idx];
+ +      new_cachep = cache_from_memcg_idx(cachep, idx);
         if (new_cachep) {
                 css_put(&memcg->css);
                 goto out;
@@@ -3452,8 -3486,8 +3493,8 @@@ void kmem_cache_destroy_memcg_children(
          * we'll take the set_limit_mutex to protect ourselves against this.
          */
         mutex_lock(&set_limit_mutex);
- -      for (i = 0; i < memcg_limited_groups_array_size; i++) {
- -              c = s->memcg_params->memcg_caches[i];
+ +      for_each_memcg_cache_index(i) {
+ +              c = cache_from_memcg_idx(s, i);
                 if (!c)
                         continue;
   
@@@ -3586,8 -3620,8 +3627,8 @@@ struct kmem_cache *__memcg_kmem_get_cac
          * code updating memcg_caches will issue a write barrier to match this.
          */
         read_barrier_depends();
- -      if (likely(cachep->memcg_params->memcg_caches[idx])) {
- -              cachep = cachep->memcg_params->memcg_caches[idx];
+ +      if (likely(cache_from_memcg_idx(cachep, idx))) {
+ +              cachep = cache_from_memcg_idx(cachep, idx);
                 goto out;
         }
   
@@@ -4357,7 -4391,7 +4398,7 @@@ mem_cgroup_uncharge_swapcache(struct pa
          * css_get() was called in uncharge().
          */
         if (do_swap_account && swapout && memcg)
- -              swap_cgroup_record(ent, css_id(&memcg->css));
+ +              swap_cgroup_record(ent, mem_cgroup_id(memcg));
   }
   #endif
   
@@@ -4409,8 -4443,8 +4450,8 @@@ static int mem_cgroup_move_swap_account
   {
         unsigned short old_id, new_id;
   
- -      old_id = css_id(&from->css);
- -      new_id = css_id(&to->css);
+ +      old_id = mem_cgroup_id(from);
+ +      new_id = mem_cgroup_id(to);
   
         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
                 mem_cgroup_swap_statistics(from, false);
@@@ -5383,50 -5417,45 +5424,50 @@@ static int mem_cgroup_move_charge_write
   static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
                                 struct cftype *cft, struct seq_file *m)
   {
+ +      struct numa_stat {
+ +              const char *name;
+ +              unsigned int lru_mask;
+ +      };
+ +
+ +      static const struct numa_stat stats[] = {
+ +              { "total", LRU_ALL },
+ +              { "file", LRU_ALL_FILE },
+ +              { "anon", LRU_ALL_ANON },
+ +              { "unevictable", BIT(LRU_UNEVICTABLE) },
+ +      };
+ +      const struct numa_stat *stat;
         int nid;
- -      unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
- -      unsigned long node_nr;
+ +      unsigned long nr;
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
   
- -      total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
- -      seq_printf(m, "total=%lu", total_nr);
- -      for_each_node_state(nid, N_MEMORY) {
- -              node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
- -              seq_printf(m, " N%d=%lu", nid, node_nr);
- -      }
- -      seq_putc(m, '\n');
- -
- -      file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
- -      seq_printf(m, "file=%lu", file_nr);
- -      for_each_node_state(nid, N_MEMORY) {
- -              node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- -                              LRU_ALL_FILE);
- -              seq_printf(m, " N%d=%lu", nid, node_nr);
- -      }
- -      seq_putc(m, '\n');
- -
- -      anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
- -      seq_printf(m, "anon=%lu", anon_nr);
- -      for_each_node_state(nid, N_MEMORY) {
- -              node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- -                              LRU_ALL_ANON);
- -              seq_printf(m, " N%d=%lu", nid, node_nr);
+ +      for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ +              nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
+ +              seq_printf(m, "%s=%lu", stat->name, nr);
+ +              for_each_node_state(nid, N_MEMORY) {
+ +                      nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ +                                                        stat->lru_mask);
+ +                      seq_printf(m, " N%d=%lu", nid, nr);
+ +              }
+ +              seq_putc(m, '\n');
+ +      }
+ +
+ +      for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+ +              struct mem_cgroup *iter;
+ +
+ +              nr = 0;
+ +              for_each_mem_cgroup_tree(iter, memcg)
+ +                      nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
+ +              seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
+ +              for_each_node_state(nid, N_MEMORY) {
+ +                      nr = 0;
+ +                      for_each_mem_cgroup_tree(iter, memcg)
+ +                              nr += mem_cgroup_node_nr_lru_pages(
+ +                                      iter, nid, stat->lru_mask);
+ +                      seq_printf(m, " N%d=%lu", nid, nr);
+ +              }
+ +              seq_putc(m, '\n');
         }
- -      seq_putc(m, '\n');
   
- -      unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
- -      seq_printf(m, "unevictable=%lu", unevictable_nr);
- -      for_each_node_state(nid, N_MEMORY) {
- -              node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- -                              BIT(LRU_UNEVICTABLE));
- -              seq_printf(m, " N%d=%lu", nid, node_nr);
- -      }
- -      seq_putc(m, '\n');
         return 0;
   }
   #endif /* CONFIG_NUMA */
@@@ -5648,13 -5677,11 +5689,11 @@@ static void mem_cgroup_oom_notify(struc
                 mem_cgroup_oom_notify_cb(iter);
   }
   
- static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args, enum res_type type)
   {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup_thresholds *thresholds;
         struct mem_cgroup_threshold_ary *new;
-       enum res_type type = MEMFILE_TYPE(cft->private);
         u64 threshold, usage;
         int i, size, ret;
   
@@@ -5731,13 -5758,23 +5770,23 @@@ unlock
         return ret;
   }
   
- static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd)
+ static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
+ {
+       return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
+ }
+ 
+ static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
+ {
+       return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
+ }
+ 
+ static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, enum res_type type)
   {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup_thresholds *thresholds;
         struct mem_cgroup_threshold_ary *new;
-       enum res_type type = MEMFILE_TYPE(cft->private);
         u64 usage;
         int i, j, size;
   
@@@ -5810,14 -5847,23 +5859,23 @@@ unlock
         mutex_unlock(&memcg->thresholds_lock);
   }
   
- static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd, const char *args)
+ /*
+  * unregister_event() callback for "memory.usage_in_bytes": forwards to
+  * __mem_cgroup_usage_unregister_event() with the _MEM resource type.
+  */
+ static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
+ {
+       __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
+ }
+ 
+ /*
+  * unregister_event() callback for "memory.memsw.usage_in_bytes": forwards
+  * to __mem_cgroup_usage_unregister_event() with the _MEMSWAP resource type.
+  */
+ static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
+ {
+       __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
+ }
+ 
+ static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd, const char *args)
   {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup_eventfd_list *event;
-       enum res_type type = MEMFILE_TYPE(cft->private);
   
-       BUG_ON(type != _OOM_TYPE);
         event = kmalloc(sizeof(*event), GFP_KERNEL);
         if (!event)
                 return -ENOMEM;
@@@ -5835,14 -5881,10 +5893,10 @@@
         return 0;
   }
   
- static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
-       struct cftype *cft, struct eventfd_ctx *eventfd)
+ static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
+       struct eventfd_ctx *eventfd)
   {
-       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
         struct mem_cgroup_eventfd_list *ev, *tmp;
-       enum res_type type = MEMFILE_TYPE(cft->private);
- 
-       BUG_ON(type != _OOM_TYPE);
   
         spin_lock(&memcg_oom_lock);
   
@@@ -5959,13 -6001,233 +6013,233 @@@ static void kmem_cgroup_css_offline(str
   }
   #endif
   
+ /*
+  * DO NOT USE IN NEW FILES.
+  *
+  * "cgroup.event_control" implementation.
+  *
+  * This is way over-engineered.  It tries to support fully configurable
+  * events for each user.  Such level of flexibility is completely
+  * unnecessary especially in the light of the planned unified hierarchy.
+  *
+  * Please deprecate this and replace with something simpler if at all
+  * possible.
+  */
+ 
+ /*
+  * Unregister event and free resources.
+  *
+  * Detaches the event from the eventfd's wait queue, invokes the event's
+  * unregister_event() callback, signals the eventfd once so userspace
+  * learns the event is gone, and then releases everything the event
+  * holds: the eventfd context reference, the event itself and the
+  * reference on the memcg's css.
+  *
+  * Gets called from workqueue.
+  */
+ static void memcg_event_remove(struct work_struct *work)
+ {
+       struct mem_cgroup_event *event =
+               container_of(work, struct mem_cgroup_event, remove);
+       struct mem_cgroup *memcg = event->memcg;
+ 
+       remove_wait_queue(event->wqh, &event->wait);
+ 
+       event->unregister_event(memcg, event->eventfd);
+ 
+       /* Notify userspace the event is going away. */
+       eventfd_signal(event->eventfd, 1);
+ 
+       eventfd_ctx_put(event->eventfd);
+       kfree(event);
+       css_put(&memcg->css);
+ }
+ 
+ /*
+  * Gets called on POLLHUP on eventfd when user closes it.
+  *
+  * Called with wqh->lock held and interrupts disabled.
+  */
+ static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
+                           int sync, void *key)
+ {
+       struct mem_cgroup_event *event =
+               container_of(wait, struct mem_cgroup_event, wait);
+       struct mem_cgroup *memcg = event->memcg;
+       unsigned long flags = (unsigned long)key;
+ 
+       if (flags & POLLHUP) {
+               /*
+                * If the event has been detached at cgroup removal, we
+                * can simply return knowing the other side will clean up
+                * for us.
+                *
+                * We can't race against event freeing since the other
+                * side will require wqh->lock via remove_wait_queue(),
+                * which we hold.
+                */
+               spin_lock(&memcg->event_list_lock);
+               if (!list_empty(&event->list)) {
+                       list_del_init(&event->list);
+                       /*
+                        * We are in atomic context, but memcg_event_remove()
+                        * may sleep, so we have to call it in workqueue.
+                        */
+                       schedule_work(&event->remove);
+               }
+               spin_unlock(&memcg->event_list_lock);
+       }
+ 
+       return 0;
+ }
+ 
+ static void memcg_event_ptable_queue_proc(struct file *file,
+               wait_queue_head_t *wqh, poll_table *pt)
+ {
+       struct mem_cgroup_event *event =
+               container_of(pt, struct mem_cgroup_event, pt);
+ 
+       event->wqh = wqh;
+       add_wait_queue(wqh, &event->wait);
+ }
+ 
+ /*
+  * DO NOT USE IN NEW FILES.
+  *
+  * Parse input and register new cgroup event handler.
+  *
+  * Input must be in format '<event_fd> <control_fd> <args>'.
+  * Interpretation of args is defined by control file implementation.
+  * <args> may be omitted, in which case the control file's
+  * register_event() callback receives an empty string.
+  *
+  * Returns 0 on success.  Returns -EINVAL if the input is malformed, the
+  * control file is not one of the supported memcg files or does not
+  * belong to @css, -ENOMEM if the event cannot be allocated, -EBADF if
+  * either file descriptor is invalid, or the error reported by
+  * eventfd_ctx_fileget(), inode_permission() or register_event().
+  */
+ static int memcg_write_event_control(struct cgroup_subsys_state *css,
+                                    struct cftype *cft, const char *buffer)
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup_event *event;
+       struct cgroup_subsys_state *cfile_css;
+       unsigned int efd, cfd;
+       struct fd efile;
+       struct fd cfile;
+       const char *name;
+       char *endp;
+       int ret;
+ 
+       efd = simple_strtoul(buffer, &endp, 10);
+       if (*endp != ' ')
+               return -EINVAL;
+       buffer = endp + 1;
+ 
+       cfd = simple_strtoul(buffer, &endp, 10);
+       /*
+        * <args> is optional.  Only step over the separator when there is
+        * one; otherwise leave @buffer on the terminating NUL so that
+        * register_event() sees an empty string instead of reading past
+        * the end of the input.
+        */
+       if (*endp == ' ')
+               buffer = endp + 1;
+       else if (*endp == '\0')
+               buffer = endp;
+       else
+               return -EINVAL;
+ 
+       event = kzalloc(sizeof(*event), GFP_KERNEL);
+       if (!event)
+               return -ENOMEM;
+ 
+       event->memcg = memcg;
+       INIT_LIST_HEAD(&event->list);
+       init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
+       init_waitqueue_func_entry(&event->wait, memcg_event_wake);
+       INIT_WORK(&event->remove, memcg_event_remove);
+ 
+       efile = fdget(efd);
+       if (!efile.file) {
+               ret = -EBADF;
+               goto out_kfree;
+       }
+ 
+       event->eventfd = eventfd_ctx_fileget(efile.file);
+       if (IS_ERR(event->eventfd)) {
+               ret = PTR_ERR(event->eventfd);
+               goto out_put_efile;
+       }
+ 
+       cfile = fdget(cfd);
+       if (!cfile.file) {
+               ret = -EBADF;
+               goto out_put_eventfd;
+       }
+ 
+       /* the process needs read permission on control file */
+       /* AV: shouldn't we check that it's been opened for read instead? */
+       ret = inode_permission(file_inode(cfile.file), MAY_READ);
+       if (ret < 0)
+               goto out_put_cfile;
+ 
+       /*
+        * Determine the event callbacks and set them in @event.  This used
+        * to be done via struct cftype but cgroup core no longer knows
+        * about these events.  The following is crude but the whole thing
+        * is for compatibility anyway.
+        *
+        * DO NOT ADD NEW FILES.
+        */
+       name = cfile.file->f_dentry->d_name.name;
+ 
+       if (!strcmp(name, "memory.usage_in_bytes")) {
+               event->register_event = mem_cgroup_usage_register_event;
+               event->unregister_event = mem_cgroup_usage_unregister_event;
+       } else if (!strcmp(name, "memory.oom_control")) {
+               event->register_event = mem_cgroup_oom_register_event;
+               event->unregister_event = mem_cgroup_oom_unregister_event;
+       } else if (!strcmp(name, "memory.pressure_level")) {
+               event->register_event = vmpressure_register_event;
+               event->unregister_event = vmpressure_unregister_event;
+       } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
+               event->register_event = memsw_cgroup_usage_register_event;
+               event->unregister_event = memsw_cgroup_usage_unregister_event;
+       } else {
+               ret = -EINVAL;
+               goto out_put_cfile;
+       }
+ 
+       /*
+        * Verify @cfile should belong to @css.  Also, remaining events are
+        * automatically removed on cgroup destruction but the removal is
+        * asynchronous, so take an extra ref on @css.
+        */
+       rcu_read_lock();
+ 
+       ret = -EINVAL;
+       cfile_css = css_from_dir(cfile.file->f_dentry->d_parent,
+                                &mem_cgroup_subsys);
+       if (cfile_css == css && css_tryget(css))
+               ret = 0;
+ 
+       rcu_read_unlock();
+       if (ret)
+               goto out_put_cfile;
+ 
+       ret = event->register_event(memcg, event->eventfd, buffer);
+       if (ret)
+               goto out_put_css;
+ 
+       efile.file->f_op->poll(efile.file, &event->pt);
+ 
+       spin_lock(&memcg->event_list_lock);
+       list_add(&event->list, &memcg->event_list);
+       spin_unlock(&memcg->event_list_lock);
+ 
+       fdput(cfile);
+       fdput(efile);
+ 
+       return 0;
+ 
+ out_put_css:
+       css_put(css);
+ out_put_cfile:
+       fdput(cfile);
+ out_put_eventfd:
+       eventfd_ctx_put(event->eventfd);
+ out_put_efile:
+       fdput(efile);
+ out_kfree:
+       kfree(event);
+ 
+       return ret;
+ }
+ 
   static struct cftype mem_cgroup_files[] = {
         {
                 .name = "usage_in_bytes",
                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
                 .read = mem_cgroup_read,
-               .register_event = mem_cgroup_usage_register_event,
-               .unregister_event = mem_cgroup_usage_unregister_event,
         },
         {
                 .name = "max_usage_in_bytes",
@@@ -6006,6 -6268,12 +6280,12 @@@
                 .read_u64 = mem_cgroup_hierarchy_read,
         },
         {
+               .name = "cgroup.event_control",         /* XXX: for compat */
+               .write_string = memcg_write_event_control,
+               .flags = CFTYPE_NO_PREFIX,
+               .mode = S_IWUGO,
+       },
+       {
                 .name = "swappiness",
                 .read_u64 = mem_cgroup_swappiness_read,
                 .write_u64 = mem_cgroup_swappiness_write,
@@@ -6019,14 -6287,10 +6299,10 @@@
                 .name = "oom_control",
                 .read_map = mem_cgroup_oom_control_read,
                 .write_u64 = mem_cgroup_oom_control_write,
-               .register_event = mem_cgroup_oom_register_event,
-               .unregister_event = mem_cgroup_oom_unregister_event,
                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
         },
         {
                 .name = "pressure_level",
-               .register_event = vmpressure_register_event,
-               .unregister_event = vmpressure_unregister_event,
         },
   #ifdef CONFIG_NUMA
         {
@@@ -6074,8 -6338,6 +6350,6 @@@ static struct cftype memsw_cgroup_files
                 .name = "memsw.usage_in_bytes",
                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
                 .read = mem_cgroup_read,
-               .register_event = mem_cgroup_usage_register_event,
-               .unregister_event = mem_cgroup_usage_unregister_event,
         },
         {
                 .name = "memsw.max_usage_in_bytes",
@@@ -6178,6 -6440,7 +6452,6 @@@ static void __mem_cgroup_free(struct me
         size_t size = memcg_size();
   
         mem_cgroup_remove_from_trees(memcg);
- -      free_css_id(&mem_cgroup_subsys, &memcg->css);
   
         for_each_node(node)
                 free_mem_cgroup_per_zone_info(memcg, node);
@@@ -6265,6 -6528,8 +6539,8 @@@ mem_cgroup_css_alloc(struct cgroup_subs
         mutex_init(&memcg->thresholds_lock);
         spin_lock_init(&memcg->move_lock);
         vmpressure_init(&memcg->vmpressure);
+       INIT_LIST_HEAD(&memcg->event_list);
+       spin_lock_init(&memcg->event_list_lock);
   
         return &memcg->css;
   
@@@ -6280,9 -6545,6 +6556,9 @@@ mem_cgroup_css_online(struct cgroup_sub
         struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
         int error = 0;
   
+ +      if (css->cgroup->id > MEM_CGROUP_ID_MAX)
+ +              return -ENOSPC;
+ +
         if (!parent)
                 return 0;
   
@@@ -6340,6 -6602,19 +6616,19 @@@ static void mem_cgroup_invalidate_recla
   static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
   {
         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
+       struct mem_cgroup_event *event, *tmp;
+ 
+       /*
+        * Unregister events and notify userspace.
+        * Notify userspace about cgroup removing only after rmdir of cgroup
+        * directory to avoid race between userspace and kernelspace.
+        */
+       spin_lock(&memcg->event_list_lock);
+       list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
+               list_del_init(&event->list);
+               schedule_work(&event->remove);
+       }
+       spin_unlock(&memcg->event_list_lock);
   
         kmem_cgroup_css_offline(memcg);
   
@@@ -6554,7 -6829,7 +6843,7 @@@ static enum mc_target_type get_mctgt_ty
         }
         /* There is a swap entry and a page doesn't exist or isn't charged */
         if (ent.val && !ret &&
- -                      css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
+ +          mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
                 ret = MC_TARGET_SWAP;
                 if (target)
                         target->ent = ent;
@@@ -6605,10 -6880,10 +6894,10 @@@ static int mem_cgroup_count_precharge_p
         pte_t *pte;
         spinlock_t *ptl;
   
- -      if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ +      if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
                         mc.precharge += HPAGE_PMD_NR;
- -              spin_unlock(&vma->vm_mm->page_table_lock);
+ +              spin_unlock(ptl);
                 return 0;
         }
   
@@@ -6797,9 -7072,9 +7086,9 @@@ static int mem_cgroup_move_charge_pte_r
          *    to be unlocked in __split_huge_page_splitting(), where the main
          *    part of thp split is not executed yet.
          */
- -      if (pmd_trans_huge_lock(pmd, vma) == 1) {
+ +      if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
                 if (mc.precharge < HPAGE_PMD_NR) {
- -                      spin_unlock(&vma->vm_mm->page_table_lock);
+ +                      spin_unlock(ptl);
                         return 0;
                 }
                 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@@ -6816,7 -7091,7 +7105,7 @@@
                         }
                         put_page(page);
                 }
- -              spin_unlock(&vma->vm_mm->page_table_lock);
+ +              spin_unlock(ptl);
                 return 0;
         }
   
@@@ -6974,6 -7249,7 +7263,6 @@@ struct cgroup_subsys mem_cgroup_subsys 
         .bind = mem_cgroup_bind,
         .base_cftypes = mem_cgroup_files,
         .early_init = 0,
- -      .use_id = 1,
   };
   
   #ifdef CONFIG_MEMCG_SWAP
author	Tejun Heo <tj@kernel.org>
	Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)
committer	Tejun Heo <tj@kernel.org>
	Fri, 22 Nov 2013 23:32:25 +0000 (18:32 -0500)
		1	2
include/linux/cgroup.h	patch \|	diff1 \|	diff2 \|	blob \| history
init/Kconfig	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/cgroup.c	patch \|	diff1 \|	diff2 \|	blob \| history
mm/memcontrol.c	patch \|	diff1 \|	diff2 \|	blob \| history