HWPOISON, hugetlb: soft offlining for hugepage

[platform/adaptation/renesas_rcar/renesas_kernel.git] / mm / memory-failure.c
diff --git a/mm/memory-failure.c b/mm/memory-failure.c

index 6b44e52..74eb425 100644 (file)
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -46,6 +46,7 @@
  #include <linux/suspend.h>
  #include <linux/slab.h>
  #include <linux/swapops.h>
+#include <linux/hugetlb.h>
  #include "internal.h"
  
  int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -182,7 +183,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter);
   * signal.
   */
  static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
-                       unsigned long pfn)
+                       unsigned long pfn, struct page *page)
  {
         struct siginfo si;
         int ret;
@@ -197,7 +198,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
  #ifdef __ARCH_SI_TRAPNO
         si.si_trapno = trapno;
  #endif
-       si.si_addr_lsb = PAGE_SHIFT;
+       si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
         /*
          * Don't use force here, it's convenient if the signal
          * can be temporarily blocked.
@@ -234,7 +235,7 @@ void shake_page(struct page *p, int access)
                 int nr;
                 do {
                         nr = shrink_slab(1000, GFP_KERNEL, 1000);
-                       if (page_count(p) == 0)
+                       if (page_count(p) == 1)
                                 break;
                 } while (nr > 10);
         }
@@ -326,7 +327,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
   * wrong earlier.
   */
  static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
-                         int fail, unsigned long pfn)
+                         int fail, struct page *page, unsigned long pfn)
  {
         struct to_kill *tk, *next;
  
@@ -351,7 +352,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
                          * process anyways.
                          */
                         else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
-                                             pfn) < 0)
+                                             pfn, page) < 0)
                                 printk(KERN_ERR
                 "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
                                         pfn, tk->tsk->comm, tk->tsk->pid);
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  /*
   * Huge pages. Needs work.
   * Issues:
- * No rmap support so we cannot find the original mapper. In theory could walk
- * all MMs and look for the mappings, but that would be non atomic and racy.
- * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
- * like just walking the current process and hoping it has it mapped (that
- * should be usually true for the common "shared database cache" case)
- * Should handle free huge pages and dequeue them too, but this needs to
- * handle huge page accounting correctly.
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ *   To narrow down kill region to one page, we need to break up pmd.
   */
  static int me_huge_page(struct page *p, unsigned long pfn)
  {
-       return FAILED;
+       int res = 0;
+       struct page *hpage = compound_head(p);
+       /*
+        * We can safely recover from error on free or reserved (i.e.
+        * not in-use) hugepage by dequeuing it from freelist.
+        * To check whether a hugepage is in-use or not, we can't use
+        * page->lru because it can be used in other hugepage operations,
+        * such as __unmap_hugepage_range() and gather_surplus_pages().
+        * So instead we use page_mapping() and PageAnon().
+        * We assume that this function is called with page lock held,
+        * so there is no race between isolation and mapping/unmapping.
+        */
+       if (!(page_mapping(hpage) || PageAnon(hpage))) {
+               res = dequeue_hwpoisoned_huge_page(hpage);
+               if (!res)
+                       return RECOVERED;
+       }
+       return DELAYED;
  }
  
  /*
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
         int ret;
         int i;
         int kill = 1;
+       struct page *hpage = compound_head(p);
  
         if (PageReserved(p) || PageSlab(p))
                 return SWAP_SUCCESS;
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * This check implies we don't kill processes if their pages
          * are in the swap cache early. Those are always late kills.
          */
-       if (!page_mapped(p))
+       if (!page_mapped(hpage))
                 return SWAP_SUCCESS;
  
-       if (PageCompound(p) || PageKsm(p))
+       if (PageKsm(p))
                 return SWAP_FAIL;
  
         if (PageSwapCache(p)) {
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * XXX: the dirty test could be racy: set_page_dirty() may not always
          * be called inside page lock (it's recommended but not enforced).
          */
-       mapping = page_mapping(p);
-       if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
-               if (page_mkclean(p)) {
-                       SetPageDirty(p);
+       mapping = page_mapping(hpage);
+       if (!PageDirty(hpage) && mapping &&
+           mapping_cap_writeback_dirty(mapping)) {
+               if (page_mkclean(hpage)) {
+                       SetPageDirty(hpage);
                 } else {
                         kill = 0;
                         ttu |= TTU_IGNORE_HWPOISON;
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * there's nothing that can be done.
          */
         if (kill)
-               collect_procs(p, &tokill);
+               collect_procs(hpage, &tokill);
  
         /*
          * try_to_unmap can fail temporarily due to races.
          * Try a few times (RED-PEN better strategy?)
          */
         for (i = 0; i < N_UNMAP_TRIES; i++) {
-               ret = try_to_unmap(p, ttu);
+               ret = try_to_unmap(hpage, ttu);
                 if (ret == SWAP_SUCCESS)
                         break;
                 pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn,  ret);
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
  
         if (ret != SWAP_SUCCESS)
                 printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
-                               pfn, page_mapcount(p));
+                               pfn, page_mapcount(hpage));
  
         /*
          * Now that the dirty bit has been propagated to the
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
          * use a more force-full uncatchable kill to prevent
          * any accesses to the poisoned memory.
          */
-       kill_procs_ao(&tokill, !!PageDirty(p), trapno,
-                     ret != SWAP_SUCCESS, pfn);
+       kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+                     ret != SWAP_SUCCESS, p, pfn);
  
         return ret;
  }
  
+/* Set PG_hwpoison on the 1 << compound_order(hpage) pages at @hpage. */
+static void set_page_hwpoison_huge_page(struct page *hpage)
+{
+       struct page *sub, *end = hpage + (1 << compound_order(hpage));
+       for (sub = hpage; sub < end; sub++)
+               SetPageHWPoison(sub);
+}
+
+/* Clear PG_hwpoison on the 1 << compound_order(hpage) pages at @hpage. */
+static void clear_page_hwpoison_huge_page(struct page *hpage)
+{
+       struct page *sub, *end = hpage + (1 << compound_order(hpage));
+       for (sub = hpage; sub < end; sub++)
+               ClearPageHWPoison(sub);
+}
+
  int __memory_failure(unsigned long pfn, int trapno, int flags)
  {
         struct page_state *ps;
         struct page *p;
+       struct page *hpage;
         int res;
+       unsigned int nr_pages;
  
         if (!sysctl_memory_failure_recovery)
                 panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -935,18 +968,23 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         }
  
         p = pfn_to_page(pfn);
+       hpage = compound_head(p);
         if (TestSetPageHWPoison(p)) {
                 printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
                 return 0;
         }
  
-       atomic_long_add(1, &mce_bad_pages);
+       nr_pages = 1 << compound_order(hpage);
+       atomic_long_add(nr_pages, &mce_bad_pages);
  
         /*
          * We need/can do nothing about count=0 pages.
          * 1) it's a free page, and therefore in safe hand:
          *    prep_new_page() will be the gate keeper.
-        * 2) it's part of a non-compound high order page.
+        * 2) it's a free hugepage, which is also safe:
+        *    an affected hugepage will be dequeued from hugepage freelist,
+        *    so there's no concern about reusing it ever after.
+        * 3) it's part of a non-compound high order page.
          *    Implies some kernel user: cannot stop them from
          *    R/W the page; let's pray that the page has been
          *    used and will be freed some time later.
@@ -954,10 +992,28 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
          * that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
          */
         if (!(flags & MF_COUNT_INCREASED) &&
-               !get_page_unless_zero(compound_head(p))) {
+               !get_page_unless_zero(hpage)) {
                 if (is_free_buddy_page(p)) {
                         action_result(pfn, "free buddy", DELAYED);
                         return 0;
+               } else if (PageHuge(hpage)) {
+                       /*
+                        * Check "just unpoisoned", "filter hit", and
+                        * "race with other subpage."
+                        */
+                       lock_page_nosync(hpage);
+                       if (!PageHWPoison(hpage)
+                           || (hwpoison_filter(p) && TestClearPageHWPoison(p))
+                           || (p != hpage && TestSetPageHWPoison(hpage))) {
+                               atomic_long_sub(nr_pages, &mce_bad_pages);
+                               return 0;
+                       }
+                       set_page_hwpoison_huge_page(hpage);
+                       res = dequeue_hwpoisoned_huge_page(hpage);
+                       action_result(pfn, "free huge",
+                                     res ? IGNORED : DELAYED);
+                       unlock_page(hpage);
+                       return res;
                 } else {
                         action_result(pfn, "high order kernel", IGNORED);
                         return -EBUSY;
@@ -972,9 +1028,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
          * The check (unnecessarily) ignores LRU pages being isolated and
          * walked by the page reclaim code, however that's not a big loss.
          */
-       if (!PageLRU(p))
+       if (!PageLRU(p) && !PageHuge(p))
                 shake_page(p, 0);
-       if (!PageLRU(p)) {
+       if (!PageLRU(p) && !PageHuge(p)) {
                 /*
                  * shake_page could have turned it free.
                  */
@@ -992,7 +1048,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
          * It's very difficult to mess with pages currently under IO
          * and in many cases impossible, so we just avoid it here.
          */
-       lock_page_nosync(p);
+       lock_page_nosync(hpage);
  
         /*
          * unpoison always clear PG_hwpoison inside page lock
@@ -1004,11 +1060,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
         }
         if (hwpoison_filter(p)) {
                 if (TestClearPageHWPoison(p))
-                       atomic_long_dec(&mce_bad_pages);
-               unlock_page(p);
-               put_page(p);
+                       atomic_long_sub(nr_pages, &mce_bad_pages);
+               unlock_page(hpage);
+               put_page(hpage);
+               return 0;
+       }
+
+       /*
+        * For error on the tail page, we should set PG_hwpoison
+        * on the head page to show that the hugepage is hwpoisoned
+        */
+       if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+               action_result(pfn, "hugepage already hardware poisoned",
+                               IGNORED);
+               unlock_page(hpage);
+               put_page(hpage);
                 return 0;
         }
+       /*
+        * Set PG_hwpoison on all pages in an error hugepage,
+        * because containment is done in hugepage unit for now.
+        * Since we have done TestSetPageHWPoison() for the head page with
+        * page lock held, we can safely set PG_hwpoison bits on tail pages.
+        */
+       if (PageHuge(p))
+               set_page_hwpoison_huge_page(hpage);
  
         wait_on_page_writeback(p);
  
@@ -1039,7 +1115,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
                 }
         }
  out:
-       unlock_page(p);
+       unlock_page(hpage);
         return res;
  }
  EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1083,6 +1159,7 @@ int unpoison_memory(unsigned long pfn)
         struct page *page;
         struct page *p;
         int freeit = 0;
+       unsigned int nr_pages;
  
         if (!pfn_valid(pfn))
                 return -ENXIO;
@@ -1095,9 +1172,21 @@ int unpoison_memory(unsigned long pfn)
                 return 0;
         }
  
+       nr_pages = 1 << compound_order(page);
+
         if (!get_page_unless_zero(page)) {
+               /*
+                * Since HWPoisoned hugepage should have non-zero refcount,
+                * race between memory failure and unpoison seems to happen.
+                * In such case unpoison fails and memory failure runs
+                * to the end.
+                */
+               if (PageHuge(page)) {
+                       pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+                       return 0;
+               }
                 if (TestClearPageHWPoison(p))
-                       atomic_long_dec(&mce_bad_pages);
+                       atomic_long_sub(nr_pages, &mce_bad_pages);
                 pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
                 return 0;
         }
@@ -1109,11 +1198,13 @@ int unpoison_memory(unsigned long pfn)
          * the PG_hwpoison page will be caught and isolated on the entrance to
          * the free buddy page pool.
          */
-       if (TestClearPageHWPoison(p)) {
+       if (TestClearPageHWPoison(page)) {
                 pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
-               atomic_long_dec(&mce_bad_pages);
+               atomic_long_sub(nr_pages, &mce_bad_pages);
                 freeit = 1;
         }
+       if (PageHuge(p))
+               clear_page_hwpoison_huge_page(page);
         unlock_page(page);
  
         put_page(page);
@@ -1127,7 +1218,11 @@ EXPORT_SYMBOL(unpoison_memory);
  static struct page *new_page(struct page *p, unsigned long private, int **x)
  {
         int nid = page_to_nid(p);
-       return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
+       if (PageHuge(p))
+               return alloc_huge_page_node(page_hstate(compound_head(p)),
+                                                  nid);
+       else
+               return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0);
  }
  
  /*
@@ -1155,8 +1250,15 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
          * was free.
          */
         set_migratetype_isolate(p);
+       /*
+        * When the target page is a free hugepage, just remove it
+        * from free hugepage list.
+        */
         if (!get_page_unless_zero(compound_head(p))) {
-               if (is_free_buddy_page(p)) {
+               if (PageHuge(p)) {
+                       pr_debug("get_any_page: %#lx free huge page\n", pfn);
+                       ret = dequeue_hwpoisoned_huge_page(compound_head(p));
+               } else if (is_free_buddy_page(p)) {
                         pr_debug("get_any_page: %#lx free buddy page\n", pfn);
                         /* Set hwpoison bit while page is still isolated */
                         SetPageHWPoison(p);
@@ -1175,6 +1277,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags)
         return ret;
  }
  
+/* Soft offline a hugepage; returns 0 on success or a negative errno. */
+static int soft_offline_huge_page(struct page *page, int flags)
+{
+       unsigned long pfn = page_to_pfn(page);
+       struct page *hpage = compound_head(page);
+       LIST_HEAD(pagelist);
+       int ret;
+
+       ret = get_any_page(page, pfn, flags);
+       if (ret < 0)
+               return ret;
+
+       /* get_any_page() == 0 skips migration and goes straight to poisoning. */
+       if (ret > 0) {
+               if (PageHWPoison(hpage)) {
+                       put_page(hpage);
+                       pr_debug("soft offline: %#lx hugepage already poisoned\n",
+                                pfn);
+                       return -EBUSY;
+               }
+               /* Keep page count to indicate a given hugepage is isolated. */
+               list_add(&hpage->lru, &pagelist);
+               ret = migrate_huge_pages(&pagelist, new_page,
+                                        MPOL_MF_MOVE_ALL, 0);
+               if (ret) {
+                       pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
+                                pfn, ret, page->flags);
+                       return ret > 0 ? -EIO : ret;
+               }
+       }
+
+       if (!PageHWPoison(hpage))
+               atomic_long_add(1 << compound_order(hpage), &mce_bad_pages);
+       set_page_hwpoison_huge_page(hpage);
+       dequeue_hwpoisoned_huge_page(hpage);
+       /* keep elevated page count for bad page */
+       return ret;
+}
+
  /**
   * soft_offline_page - Soft offline a page.
   * @page: page to offline
@@ -1202,6 +1343,9 @@ int soft_offline_page(struct page *page, int flags)
         int ret;
         unsigned long pfn = page_to_pfn(page);
  
+       if (PageHuge(page))
+               return soft_offline_huge_page(page, flags);
+
         ret = get_any_page(page, pfn, flags);
         if (ret < 0)
                 return ret;