memcg: charge before adding to swapcache on swapin
author Shakeel Butt <shakeelb@google.com>
Fri, 30 Apr 2021 05:56:36 +0000 (22:56 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 30 Apr 2021 18:20:38 +0000 (11:20 -0700)
Currently the kernel adds the page, allocated for swapin, to the
swapcache before charging the page.  This is fine, but now we want a
per-memcg swapcache stat, which is essential for folks who want to
transparently migrate from cgroup v1's memsw to cgroup v2's memory and
swap counters.  In addition, charging a page before exposing it to
other parts of the kernel is a step in the right direction.

To correctly maintain the per-memcg swapcache stat, this patch charges
the page before adding it to the swapcache.  One challenge with this
approach is the failure case of add_to_swap_cache(), where we would
need to undo the mem_cgroup_charge().  Specifically, undoing
mem_cgroup_uncharge_swap() is not simple, as the sketch below
illustrates.
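
A hypothetical sketch of the problem (this code is not from the patch;
it reorders the old __read_swap_cache_async() charge step to charge
first, to show the undo the patch is avoiding):

static int charge_before_cache(struct page *page, swp_entry_t entry,
                               gfp_t gfp_mask, void **shadowp)
{
        int err;

        /* Tell memcg to use swap ownership records, as the old
         * do_swap_page() did, then charge up front. */
        SetPageSwapCache(page);
        err = mem_cgroup_charge(page, NULL, gfp_mask);
        ClearPageSwapCache(page);
        if (err)
                return err;

        err = add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK,
                                shadowp);
        if (err) {
                /* Undoing the page charge itself is easy ... */
                mem_cgroup_uncharge(page);
                /*
                 * ... but mem_cgroup_charge() already uncharged the
                 * swap slot (cgroup v1's memsw), and there is no
                 * simple way to re-charge that slot here.
                 */
        }
        return err;
}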

To resolve the issue, this patch decouples the charging for swapin
pages from mem_cgroup_charge().  Two new functions are introduced:
mem_cgroup_swapin_charge_page(), which just charges the swapin page,
and mem_cgroup_swapin_uncharge_swap(), which uncharges the swap slot
once the page has been successfully added to the swapcache.
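
A condensed sketch of the resulting call order, mirroring the
do_swap_page() hunk below (the helper name and error handling are
illustrative, not part of the patch):

static vm_fault_t swapin_charge_sketch(struct vm_fault *vmf,
                                       swp_entry_t entry)
{
        struct vm_area_struct *vma = vmf->vma;
        struct page *page;

        page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address);
        if (!page)
                return VM_FAULT_OOM;

        __SetPageLocked(page);
        __SetPageSwapBacked(page);

        /* 1. Charge before the page is exposed to the rest of the kernel. */
        if (mem_cgroup_swapin_charge_page(page, vma->vm_mm, GFP_KERNEL,
                                          entry)) {
                unlock_page(page);
                put_page(page);
                return VM_FAULT_OOM;
        }

        /* 2. The page holds the charge; drop the duplicate swap charge. */
        mem_cgroup_swapin_uncharge_swap(entry);

        /* 3. Only now make the page visible and read it in. */
        lru_cache_add(page);
        set_page_private(page, entry.val);      /* entry for swap_readpage() */
        swap_readpage(page, true);
        set_page_private(page, 0);

        return 0;
}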

[shakeelb@google.com: set page->private before calling swap_readpage]
Link: https://lkml.kernel.org/r/20210318015959.2986837-1-shakeelb@google.com
Link: https://lkml.kernel.org/r/20210305212639.775498-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Hugh Dickins <hughd@google.com>
Tested-by: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
include/linux/memcontrol.h
mm/memcontrol.c
mm/memory.c
mm/swap_state.c

include/linux/memcontrol.h
index 74910ce9a3f9fad96ab2de69dcbe2fd897ada219..e946c96daa32bc1ff7c19ce8dde82d3b7e7c7c1e 100644
@@ -609,6 +609,9 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
 }
 
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
+int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+                                 gfp_t gfp, swp_entry_t entry);
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
 
 void mem_cgroup_uncharge(struct page *page);
 void mem_cgroup_uncharge_list(struct list_head *page_list);
@@ -1112,6 +1115,16 @@ static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
        return 0;
 }
 
+static inline int mem_cgroup_swapin_charge_page(struct page *page,
+                       struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
+{
+       return 0;
+}
+
+static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
+}
+
 static inline void mem_cgroup_uncharge(struct page *page)
 {
 }
mm/memcontrol.c
index 5582e1531b43e4a0fee59f0d783df1e8c503a0c2..6d0863db3e41fcc90eced432166e6f675c4fb7c7 100644
@@ -6644,6 +6644,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
                        atomic_long_read(&parent->memory.children_low_usage)));
 }
 
+static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
+                              gfp_t gfp)
+{
+       unsigned int nr_pages = thp_nr_pages(page);
+       int ret;
+
+       ret = try_charge(memcg, gfp, nr_pages);
+       if (ret)
+               goto out;
+
+       css_get(&memcg->css);
+       commit_charge(page, memcg);
+
+       local_irq_disable();
+       mem_cgroup_charge_statistics(memcg, page, nr_pages);
+       memcg_check_events(memcg, page);
+       local_irq_enable();
+out:
+       return ret;
+}
+
 /**
  * mem_cgroup_charge - charge a newly allocated page to a cgroup
  * @page: page to charge
@@ -6653,55 +6674,71 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
  * Try to charge @page to the memcg that @mm belongs to, reclaiming
  * pages according to @gfp_mask if necessary.
  *
+ * Do not use this for pages allocated for swapin.
+ *
  * Returns 0 on success. Otherwise, an error code is returned.
  */
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
-       unsigned int nr_pages = thp_nr_pages(page);
-       struct mem_cgroup *memcg = NULL;
-       int ret = 0;
+       struct mem_cgroup *memcg;
+       int ret;
 
        if (mem_cgroup_disabled())
-               goto out;
+               return 0;
 
-       if (PageSwapCache(page)) {
-               swp_entry_t ent = { .val = page_private(page), };
-               unsigned short id;
+       memcg = get_mem_cgroup_from_mm(mm);
+       ret = __mem_cgroup_charge(page, memcg, gfp_mask);
+       css_put(&memcg->css);
 
-               /*
-                * Every swap fault against a single page tries to charge the
-                * page, bail as early as possible.  shmem_unuse() encounters
-                * already charged pages, too.  page and memcg binding is
-                * protected by the page lock, which serializes swap cache
-                * removal, which in turn serializes uncharging.
-                */
-               VM_BUG_ON_PAGE(!PageLocked(page), page);
-               if (page_memcg(compound_head(page)))
-                       goto out;
+       return ret;
+}
 
-               id = lookup_swap_cgroup_id(ent);
-               rcu_read_lock();
-               memcg = mem_cgroup_from_id(id);
-               if (memcg && !css_tryget_online(&memcg->css))
-                       memcg = NULL;
-               rcu_read_unlock();
-       }
+/**
+ * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp: reclaim mode
+ * @entry: swap entry for which the page is allocated
+ *
+ * This function charges a page allocated for swapin. Please call this before
+ * adding the page to the swapcache.
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
+ */
+int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+                                 gfp_t gfp, swp_entry_t entry)
+{
+       struct mem_cgroup *memcg;
+       unsigned short id;
+       int ret;
 
-       if (!memcg)
+       if (mem_cgroup_disabled())
+               return 0;
+
+       id = lookup_swap_cgroup_id(entry);
+       rcu_read_lock();
+       memcg = mem_cgroup_from_id(id);
+       if (!memcg || !css_tryget_online(&memcg->css))
                memcg = get_mem_cgroup_from_mm(mm);
+       rcu_read_unlock();
 
-       ret = try_charge(memcg, gfp_mask, nr_pages);
-       if (ret)
-               goto out_put;
+       ret = __mem_cgroup_charge(page, memcg, gfp);
 
-       css_get(&memcg->css);
-       commit_charge(page, memcg);
-
-       local_irq_disable();
-       mem_cgroup_charge_statistics(memcg, page, nr_pages);
-       memcg_check_events(memcg, page);
-       local_irq_enable();
+       css_put(&memcg->css);
+       return ret;
+}
 
+/*
+ * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
+ * @entry: swap entry for which the page is charged
+ *
+ * Call this function after successfully adding the charged page to swapcache.
+ *
+ * Note: This function assumes that the page for which the swap slot is
+ * being uncharged is an order-0 page.
+ */
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
        /*
         * Cgroup1's unified memory+swap counter has been charged with the
         * new swapcache page, finish the transfer by uncharging the swap
@@ -6714,20 +6751,14 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
         * correspond 1:1 to page and swap slot lifetimes: we charge the
         * page to memory here, and uncharge swap when the slot is freed.
         */
-       if (do_memsw_account() && PageSwapCache(page)) {
-               swp_entry_t entry = { .val = page_private(page) };
+       if (!mem_cgroup_disabled() && do_memsw_account()) {
                /*
                 * The swap entry might not get freed for a long time,
                 * let's not wait for it.  The page already received a
                 * memory+swap charge, drop the swap entry duplicate.
                 */
-               mem_cgroup_uncharge_swap(entry, nr_pages);
+               mem_cgroup_uncharge_swap(entry, 1);
        }
-
-out_put:
-       css_put(&memcg->css);
-out:
-       return ret;
 }
 
 struct uncharge_gather {
mm/memory.c
index 550405fc3b5e6396a1dd1fd974dde3e07a3c9d15..3196fa5e7f7d781bb3a96b794feb613108670459 100644
@@ -3309,28 +3309,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
                        page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
                                                        vmf->address);
                        if (page) {
-                               int err;
-
                                __SetPageLocked(page);
                                __SetPageSwapBacked(page);
-                               set_page_private(page, entry.val);
 
-                               /* Tell memcg to use swap ownership records */
-                               SetPageSwapCache(page);
-                               err = mem_cgroup_charge(page, vma->vm_mm,
-                                                       GFP_KERNEL);
-                               ClearPageSwapCache(page);
-                               if (err) {
+                               if (mem_cgroup_swapin_charge_page(page,
+                                       vma->vm_mm, GFP_KERNEL, entry)) {
                                        ret = VM_FAULT_OOM;
                                        goto out_page;
                                }
+                               mem_cgroup_swapin_uncharge_swap(entry);
 
                                shadow = get_shadow_from_swap_cache(entry);
                                if (shadow)
                                        workingset_refault(page, shadow);
 
                                lru_cache_add(page);
+
+                               /* To provide entry to swap_readpage() */
+                               set_page_private(page, entry.val);
                                swap_readpage(page, true);
+                               set_page_private(page, 0);
                        }
                } else {
                        page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
mm/swap_state.c
index 3cdee7b11da9b678f1bfd2a7ec9e1da9dfdea380..fb7efa08fe577470aea28b1a8bdd2c5075cd6766 100644
@@ -497,16 +497,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
        __SetPageLocked(page);
        __SetPageSwapBacked(page);
 
-       /* May fail (-ENOMEM) if XArray node allocation failed. */
-       if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
-               put_swap_page(page, entry);
+       if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
                goto fail_unlock;
-       }
 
-       if (mem_cgroup_charge(page, NULL, gfp_mask)) {
-               delete_from_swap_cache(page);
+       /* May fail (-ENOMEM) if XArray node allocation failed. */
+       if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
                goto fail_unlock;
-       }
+
+       mem_cgroup_swapin_uncharge_swap(entry);
 
        if (shadow)
                workingset_refault(page, shadow);
@@ -517,6 +515,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
        return page;
 
 fail_unlock:
+       put_swap_page(page, entry);
        unlock_page(page);
        put_page(page);
        return NULL;