mm: charge active memcg when no mm is set
author Dan Schatzberg <schatzberg.dan@gmail.com>
Tue, 29 Jun 2021 02:38:18 +0000 (19:38 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 29 Jun 2021 17:53:50 +0000 (10:53 -0700)
set_active_memcg() worked for kernel allocations but was silently ignored
for user pages.

This patch establishes a precedence order for who gets charged:

1. If there is a memcg associated with the page already, that memcg is
   charged. This happens during swapin.

2. If an explicit mm is passed, mm->memcg is charged. This happens
   during page faults, which can be triggered in remote VMs (e.g. gup).

3. Otherwise consult the current process context. If there is an
   active_memcg, use that. Otherwise, current->mm->memcg.

Previously, if a NULL mm was passed to mem_cgroup_charge() (case 3), it
would always charge the root cgroup.  Now it looks up the active_memcg
first (falling back to charging the root cgroup if not set).
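
In effect, the lookup now behaves like the following stand-alone model
(a sketch only: the structs are simplified stand-ins, and in the real
kernel mm->memcg is reached indirectly via mm->owner rather than stored
as a field):

    struct mem_cgroup { const char *name; };
    struct mm_struct  { struct mem_cgroup *memcg; };
    struct task       { struct mm_struct *mm;
                        struct mem_cgroup *active_memcg; };
    struct page_info  { struct mem_cgroup *memcg; }; /* set on swapin */

    static struct mem_cgroup root_memcg = { "root" };

    static struct mem_cgroup *charge_target(struct page_info *page,
                                            struct mm_struct *mm,
                                            struct task *current_task)
    {
            /* 1. A memcg already attached to the page (swapin) wins. */
            if (page->memcg)
                    return page->memcg;
            /* 2. An explicit mm (e.g. a remote fault via gup) is next. */
            if (mm)
                    return mm->memcg;
            /* 3. Otherwise consult the calling context. */
            if (current_task->active_memcg)
                    return current_task->active_memcg;
            if (current_task->mm)
                    return current_task->mm->memcg;
            return &root_memcg; /* previously the only NULL-mm outcome */
    }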

Link: https://lkml.kernel.org/r/20210610173944.1203706-3-schatzberg.dan@gmail.com
Signed-off-by: Dan Schatzberg <schatzberg.dan@gmail.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Tejun Heo <tj@kernel.org>
Acked-by: Chris Down <chris@chrisdown.name>
Acked-by: Jens Axboe <axboe@kernel.dk>
Reviewed-by: Shakeel Butt <shakeelb@google.com>
Reviewed-by: Michal Koutný <mkoutny@suse.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Ming Lei <ming.lei@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
mm/filemap.c
mm/memcontrol.c
mm/shmem.c

diff --git a/mm/filemap.c b/mm/filemap.c
index 66f7e9f..ac82a93 100644
@@ -872,7 +872,7 @@ noinline int __add_to_page_cache_locked(struct page *page,
        page->index = offset;
 
        if (!huge) {
-               error = mem_cgroup_charge(page, current->mm, gfp);
+               error = mem_cgroup_charge(page, NULL, gfp);
                if (error)
                        goto error;
                charged = true;
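
With NULL passed here, a task that has called set_active_memcg() gets its
pagecache charges redirected as well, not just its kernel allocations.  A
hedged usage sketch (page, mapping, index and memcg are assumed to exist
in the caller; they are not part of this patch):

    struct mem_cgroup *old_memcg;
    int ret;

    old_memcg = set_active_memcg(memcg);    /* begin remote charging */
    ret = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL);
    set_active_memcg(old_memcg);            /* restore previous context */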
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f7a552e..8f3244e 100644
@@ -897,13 +897,24 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 }
 EXPORT_SYMBOL(mem_cgroup_from_task);
 
+static __always_inline struct mem_cgroup *active_memcg(void)
+{
+       if (in_interrupt())
+               return this_cpu_read(int_active_memcg);
+       else
+               return current->active_memcg;
+}
+
 /**
  * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
  * @mm: mm from which memcg should be extracted. It can be NULL.
  *
- * Obtain a reference on mm->memcg and returns it if successful. Otherwise
- * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
- * returned.
+ * Obtain a reference on mm->memcg and return it if successful. If mm
+ * is NULL, the memcg is chosen as follows:
+ * 1) The active memcg, if set.
+ * 2) current->mm->memcg, if available.
+ * 3) The root memcg.
+ * If mem_cgroup is disabled, NULL is returned.
  */
 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
@@ -921,8 +932,17 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
         * counting is disabled on the root level in the
         * cgroup core. See CSS_NO_REF.
         */
-       if (unlikely(!mm))
-               return root_mem_cgroup;
+       if (unlikely(!mm)) {
+               memcg = active_memcg();
+               if (unlikely(memcg)) {
+                       /* remote memcg must hold a ref */
+                       css_get(&memcg->css);
+                       return memcg;
+               }
+               mm = current->mm;
+               if (unlikely(!mm))
+                       return root_mem_cgroup;
+       }
 
        rcu_read_lock();
        do {
@@ -935,14 +955,6 @@ struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 }
 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
 
-static __always_inline struct mem_cgroup *active_memcg(void)
-{
-       if (in_interrupt())
-               return this_cpu_read(int_active_memcg);
-       else
-               return current->active_memcg;
-}
-
 static __always_inline bool memcg_kmem_bypass(void)
 {
        /* Allow remote memcg charging from any context. */
@@ -6711,7 +6723,8 @@ out:
  * @gfp_mask: reclaim mode
  *
  * Try to charge @page to the memcg that @mm belongs to, reclaiming
- * pages according to @gfp_mask if necessary.
+ * pages according to @gfp_mask if necessary. If @mm is NULL, try to
+ * charge to the active memcg.
  *
  * Do not use this for pages allocated for swapin.
  *
diff --git a/mm/shmem.c b/mm/shmem.c
index 53f2101..e72931b 100644
@@ -1695,7 +1695,7 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
 {
        struct address_space *mapping = inode->i_mapping;
        struct shmem_inode_info *info = SHMEM_I(inode);
-       struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
+       struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL;
        struct swap_info_struct *si;
        struct page *page = NULL;
        swp_entry_t swap;
@@ -1828,7 +1828,7 @@ repeat:
        }
 
        sbinfo = SHMEM_SB(inode->i_sb);
-       charge_mm = vma ? vma->vm_mm : current->mm;
+       charge_mm = vma ? vma->vm_mm : NULL;
 
        page = pagecache_get_page(mapping, index,
                                        FGP_ENTRY | FGP_HEAD | FGP_LOCK, 0);
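
The shmem callers hit the vma == NULL case during swapoff, where pages are
brought back in without a faulting vma.  Passing NULL instead of
current->mm means the charge resolves through the context rules above
(active memcg first).  A minimal sketch of the downstream call, assuming
the mem_cgroup_charge() signature of this kernel version:

    /* charge_mm == NULL: resolve via active_memcg(), then
     * current->mm->memcg, then root, instead of unconditionally
     * charging current->mm. */
    error = mem_cgroup_charge(page, charge_mm, gfp);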