x86: add tizen_qemu_x86_defconfig & tizen_qemu_x86_64_defconfig

[platform/kernel/linux-rpi.git] / mm / memory-failure.c
diff --git a/mm/memory-failure.c b/mm/memory-failure.c

index 3e6449f..2ad0f45 100644 (file)
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -57,6 +57,7 @@
  #include <linux/ratelimit.h>
  #include <linux/page-isolation.h>
  #include <linux/pagewalk.h>
+#include <linux/shmem_fs.h>
  #include "internal.h"
  #include "ras/ras_event.h"
  
@@ -700,13 +701,18 @@ static int kill_accessing_process(struct task_struct *p, unsigned long pfn,
         };
         priv.tk.tsk = p;
  
+       if (!p->mm)
+               return -EFAULT;
+
         mmap_read_lock(p->mm);
         ret = walk_page_range(p->mm, 0, TASK_SIZE, &hwp_walk_ops,
                               (void *)&priv);
         if (ret == 1 && priv.tk.addr)
                 kill_proc(&priv.tk, pfn, flags);
+       else
+               ret = 0;
         mmap_read_unlock(p->mm);
-       return ret ? -EFAULT : -EHWPOISON;
+       return ret > 0 ? -EHWPOISON : -EFAULT;
  }
  
  static const char *action_name[] = {
@@ -806,12 +812,44 @@ static int truncate_error_page(struct page *p, unsigned long pfn,
         return ret;
  }
  
+struct page_state {
+       unsigned long mask;     /* page flag bits to test (TODO confirm matching rule) */
+       unsigned long res;      /* expected value of the masked bits (TODO confirm) */
+       enum mf_action_page_type type;  /* indexes action_page_types[] in log messages */
+
+       /* Handler; ->action() has to unlock the relevant page inside it. */
+       int (*action)(struct page_state *ps, struct page *p);
+};
+
+/*
+ * Return true (and log the pfn, page type and leftover count) if @p is
+ * still referenced by others, otherwise return false.
+ * One reference is always discounted; @extra_pins is true when one more
+ * reference is expected to remain and must be discounted as well.
+ */
+static bool has_extra_refcount(struct page_state *ps, struct page *p,
+                              bool extra_pins)
+{
+       int count = page_count(p) - 1;
+
+       if (extra_pins)
+               count -= 1;
+
+       if (count > 0) {
+               pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
+                      page_to_pfn(p), action_page_types[ps->type], count);
+               return true;
+       }
+
+       return false;
+}
+
  /*
   * Error hit kernel page.
   * Do nothing, try to be lucky and not touch this instead. For a few cases we
   * could be more sophisticated.
   */
-static int me_kernel(struct page *p, unsigned long pfn)
+static int me_kernel(struct page_state *ps, struct page *p)
  {
         unlock_page(p);
         return MF_IGNORED;
@@ -820,9 +858,9 @@ static int me_kernel(struct page *p, unsigned long pfn)
  /*
   * Page in unknown state. Do nothing.
   */
-static int me_unknown(struct page *p, unsigned long pfn)
+static int me_unknown(struct page_state *ps, struct page *p)
  {
-       pr_err("Memory failure: %#lx: Unknown page state\n", pfn);
+       pr_err("Memory failure: %#lx: Unknown page state\n", page_to_pfn(p));
         unlock_page(p);
         return MF_FAILED;
  }
@@ -830,10 +868,11 @@ static int me_unknown(struct page *p, unsigned long pfn)
  /*
   * Clean (or cleaned) page cache page.
   */
-static int me_pagecache_clean(struct page *p, unsigned long pfn)
+static int me_pagecache_clean(struct page_state *ps, struct page *p)
  {
         int ret;
         struct address_space *mapping;
+       bool extra_pins;
  
         delete_from_lru_cache(p);
  
@@ -863,13 +902,23 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn)
         }
  
         /*
+        * A shmem page is kept in the page cache instead of being truncated,
+        * so it is expected to have an extra refcount after error handling.
+        */
+       extra_pins = shmem_mapping(mapping);
+
+       /*
          * Truncation is a bit tricky. Enable it per file system for now.
          *
          * Open: to take i_rwsem or not for this? Right now we don't.
          */
-       ret = truncate_error_page(p, pfn, mapping);
+       ret = truncate_error_page(p, page_to_pfn(p), mapping);
+       if (has_extra_refcount(ps, p, extra_pins))
+               ret = MF_FAILED;
+
  out:
         unlock_page(p);
+
         return ret;
  }
  
@@ -878,7 +927,7 @@ out:
   * Issues: when the error hit a hole page the error is not properly
   * propagated.
   */
-static int me_pagecache_dirty(struct page *p, unsigned long pfn)
+static int me_pagecache_dirty(struct page_state *ps, struct page *p)
  {
         struct address_space *mapping = page_mapping(p);
  
@@ -922,7 +971,7 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
                 mapping_set_error(mapping, -EIO);
         }
  
-       return me_pagecache_clean(p, pfn);
+       return me_pagecache_clean(ps, p);
  }
  
  /*
@@ -944,9 +993,10 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn)
   * Clean swap cache pages can be directly isolated. A later page fault will
   * bring in the known good data from disk.
   */
-static int me_swapcache_dirty(struct page *p, unsigned long pfn)
+static int me_swapcache_dirty(struct page_state *ps, struct page *p)
  {
         int ret;
+       bool extra_pins = false;
  
         ClearPageDirty(p);
         /* Trigger EIO in shmem: */
@@ -954,10 +1004,17 @@ static int me_swapcache_dirty(struct page *p, unsigned long pfn)
  
         ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED;
         unlock_page(p);
+
+       if (ret == MF_DELAYED)
+               extra_pins = true;
+
+       if (has_extra_refcount(ps, p, extra_pins))
+               ret = MF_FAILED;
+
         return ret;
  }
  
-static int me_swapcache_clean(struct page *p, unsigned long pfn)
+static int me_swapcache_clean(struct page_state *ps, struct page *p)
  {
         int ret;
  
@@ -965,6 +1022,10 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
  
         ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED;
         unlock_page(p);
+
+       if (has_extra_refcount(ps, p, false))
+               ret = MF_FAILED;
+
         return ret;
  }
  
@@ -974,18 +1035,21 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
   * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
   *   To narrow down kill region to one page, we need to break up pmd.
   */
-static int me_huge_page(struct page *p, unsigned long pfn)
+static int me_huge_page(struct page_state *ps, struct page *p)
  {
         int res;
         struct page *hpage = compound_head(p);
         struct address_space *mapping;
+       bool extra_pins = false;
  
         if (!PageHuge(hpage))
                 return MF_DELAYED;
  
         mapping = page_mapping(hpage);
         if (mapping) {
-               res = truncate_error_page(hpage, pfn, mapping);
+               res = truncate_error_page(hpage, page_to_pfn(p), mapping);
+               /* The page is kept in page cache. */
+               extra_pins = true;
                 unlock_page(hpage);
         } else {
                 res = MF_FAILED;
@@ -1003,6 +1067,9 @@ static int me_huge_page(struct page *p, unsigned long pfn)
                 }
         }
  
+       if (has_extra_refcount(ps, p, extra_pins))
+               res = MF_FAILED;
+
         return res;
  }
  
@@ -1028,14 +1095,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
  #define slab           (1UL << PG_slab)
  #define reserved       (1UL << PG_reserved)
  
-static struct page_state {
-       unsigned long mask;
-       unsigned long res;
-       enum mf_action_page_type type;
-
-       /* Callback ->action() has to unlock the relevant page inside it. */
-       int (*action)(struct page *p, unsigned long pfn);
-} error_states[] = {
+static struct page_state error_states[] = {
         { reserved,     reserved,       MF_MSG_KERNEL,  me_kernel },
         /*
          * free pages are specially detected outside this table:
@@ -1095,19 +1155,10 @@ static int page_action(struct page_state *ps, struct page *p,
                         unsigned long pfn)
  {
         int result;
-       int count;
  
         /* page p should be unlocked after returning from ps->action().  */
-       result = ps->action(p, pfn);
+       result = ps->action(ps, p);
  
-       count = page_count(p) - 1;
-       if (ps->action == me_swapcache_dirty && result == MF_DELAYED)
-               count--;
-       if (count > 0) {
-               pr_err("Memory failure: %#lx: %s still referenced by %d users\n",
-                      pfn, action_page_types[ps->type], count);
-               result = MF_FAILED;
-       }
         action_result(pfn, ps->type, result);
  
         /* Could do more checks here if page looks ok */
@@ -1147,20 +1198,6 @@ static int __get_hwpoison_page(struct page *page)
         if (!HWPoisonHandlable(head))
                 return -EBUSY;
  
-       if (PageTransHuge(head)) {
-               /*
-                * Non anonymous thp exists only in allocation/free time. We
-                * can't handle such a case correctly, so let's give it up.
-                * This should be better than triggering BUG_ON when kernel
-                * tries to touch the "partially handled" page.
-                */
-               if (!PageAnon(head)) {
-                       pr_err("Memory failure: %#lx: non anonymous thp\n",
-                               page_to_pfn(page));
-                       return 0;
-               }
-       }
-
         if (get_page_unless_zero(head)) {
                 if (head == compound_head(page))
                         return 1;
@@ -1229,7 +1266,7 @@ try_again:
         }
  out:
         if (ret == -EIO)
-               dump_page(p, "hwpoison: unhandlable page");
+               pr_err("Memory failure: %#lx: unhandlable page.\n", page_to_pfn(p));
  
         return ret;
  }
@@ -1414,14 +1451,11 @@ static int identify_page_state(unsigned long pfn, struct page *p,
  static int try_to_split_thp_page(struct page *page, const char *msg)
  {
         lock_page(page);
-       if (!PageAnon(page) || unlikely(split_huge_page(page))) {
+       if (unlikely(split_huge_page(page))) {
                 unsigned long pfn = page_to_pfn(page);
  
                 unlock_page(page);
-               if (!PageAnon(page))
-                       pr_info("%s: %#lx: non anonymous thp\n", msg, pfn);
-               else
-                       pr_info("%s: %#lx: thp split failed\n", msg, pfn);
+               pr_info("%s: %#lx: thp split failed\n", msg, pfn);
                 put_page(page);
                 return -EBUSY;
         }
@@ -1430,64 +1464,115 @@ static int try_to_split_thp_page(struct page *page, const char *msg)
         return 0;
  }
  
-static int memory_failure_hugetlb(unsigned long pfn, int flags)
+/*
+ * Called from hugetlb code with hugetlb_lock held.  On 0 or 1 the head
+ * page has been marked HWPoison; on 1 a reference on it is kept as well.
+ * Return values:
+ *   0             - free hugepage (no reference could be taken)
+ *   1             - in-use hugepage
+ *   2             - not a hugepage, fall back to normal page handling
+ *   -EBUSY        - the hugepage is busy (the caller may retry)
+ *   -EHWPOISON    - the hugepage is already hwpoisoned
+ */
+int __get_huge_page_for_hwpoison(unsigned long pfn, int flags)
+{
+       struct page *page = pfn_to_page(pfn);
+       struct page *head = compound_head(page);
+       int ret = 2;    /* fallback to normal page handling */
+       bool count_increased = false;
+
+       if (!PageHeadHuge(head))
+               goto out;
+
+       if (flags & MF_COUNT_INCREASED) {
+               ret = 1;
+               count_increased = true;
+       } else if (HPageFreed(head) || HPageMigratable(head)) {
+               ret = get_page_unless_zero(head);       /* 0 if no reference could be taken */
+               if (ret)
+                       count_increased = true;
+       } else {
+               ret = -EBUSY;
+               goto out;
+       }
+
+       if (TestSetPageHWPoison(head)) {
+               ret = -EHWPOISON;
+               goto out;
+       }
+
+       return ret;
+out:
+       if (count_increased)
+               put_page(head); /* drop the reference counted above */
+       return ret;
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Taking refcount of hugetlb pages needs extra care about race conditions
+ * with basic operations like hugepage allocation/free/demotion.
+ * So some of prechecks for hwpoison (pinning, and testing/setting
+ * PageHWPoison) should be done in single hugetlb_lock range.
+ */
+static int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
  {
-       struct page *p = pfn_to_page(pfn);
-       struct page *head = compound_head(p);
         int res;
+       struct page *p = pfn_to_page(pfn);
+       struct page *head;
         unsigned long page_flags;
+       bool retry = true;
  
-       if (TestSetPageHWPoison(head)) {
-               pr_err("Memory failure: %#lx: already hardware poisoned\n",
-                      pfn);
-               res = -EHWPOISON;
-               if (flags & MF_ACTION_REQUIRED)
+       *hugetlb = 1;
+retry:
+       res = get_huge_page_for_hwpoison(pfn, flags);
+       if (res == 2) { /* fallback to normal page handling */
+               *hugetlb = 0;
+               return 0;
+       } else if (res == -EHWPOISON) {
+               pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn);
+               if (flags & MF_ACTION_REQUIRED) {
+                       head = compound_head(p);
                         res = kill_accessing_process(current, page_to_pfn(head), flags);
+               }
+               return res;
+       } else if (res == -EBUSY) {
+               if (retry) {
+                       retry = false;
+                       goto retry;
+               }
+               action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
                 return res;
         }
  
-       num_poisoned_pages_inc();
+       head = compound_head(p);
+       lock_page(head);
  
-       if (!(flags & MF_COUNT_INCREASED)) {
-               res = get_hwpoison_page(p, flags);
-               if (!res) {
-                       /*
-                        * Check "filter hit" and "race with other subpage."
-                        */
-                       lock_page(head);
-                       if (PageHWPoison(head)) {
-                               if ((hwpoison_filter(p) && TestClearPageHWPoison(p))
-                                   || (p != head && TestSetPageHWPoison(head))) {
-                                       num_poisoned_pages_dec();
-                                       unlock_page(head);
-                                       return 0;
-                               }
-                       }
-                       unlock_page(head);
-                       res = MF_FAILED;
-                       if (__page_handle_poison(p)) {
-                               page_ref_inc(p);
-                               res = MF_RECOVERED;
-                       }
-                       action_result(pfn, MF_MSG_FREE_HUGE, res);
-                       return res == MF_RECOVERED ? 0 : -EBUSY;
-               } else if (res < 0) {
-                       action_result(pfn, MF_MSG_UNKNOWN, MF_IGNORED);
-                       return -EBUSY;
-               }
+       if (hwpoison_filter(p)) {
+               ClearPageHWPoison(head);
+               res = -EOPNOTSUPP;
+               goto out;
         }
  
-       lock_page(head);
-       page_flags = head->flags;
+       num_poisoned_pages_inc();
  
-       if (!PageHWPoison(head)) {
-               pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
-               num_poisoned_pages_dec();
+       /*
+        * Handling free hugepage.  The possible race with hugepage allocation
+        * or demotion can be prevented by PageHWPoison flag.
+        */
+       if (res == 0) {
                 unlock_page(head);
-               put_page(head);
-               return 0;
+               res = MF_FAILED;
+               if (__page_handle_poison(p)) {
+                       page_ref_inc(p);
+                       res = MF_RECOVERED;
+               }
+               action_result(pfn, MF_MSG_FREE_HUGE, res);
+               return res == MF_RECOVERED ? 0 : -EBUSY;
         }
  
+       page_flags = head->flags;
+
         /*
          * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
          * simply disable it. In order to make it work properly, we need
@@ -1514,6 +1599,12 @@ out:
         unlock_page(head);
         return res;
  }
+#else
+static inline int try_memory_failure_hugetlb(unsigned long pfn, int flags, int *hugetlb)
+{
+       return 0;       /* !CONFIG_HUGETLB_PAGE stub: *hugetlb is left as the caller set it */
+}
+#endif
  
  static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                 struct dev_pagemap *pgmap)
@@ -1550,7 +1641,7 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
                 goto out;
  
         if (hwpoison_filter(page)) {
-               rc = 0;
+               rc = -EOPNOTSUPP;
                 goto unlock;
         }
  
@@ -1601,6 +1692,8 @@ out:
         return rc;
  }
  
+static DEFINE_MUTEX(mf_mutex);
+
  /**
   * memory_failure - Handle memory failure of a page.
   * @pfn: Page Number of the corrupted page
@@ -1617,6 +1710,10 @@ out:
   *
   * Must run in process context (e.g. a work queue) with interrupts
   * enabled and no spinlocks hold.
+ *
+ * Return: 0 if the memory error was handled successfully,
+ *         -EOPNOTSUPP if hwpoison_filter() filtered the error event,
+ *         < 0 (other than -EOPNOTSUPP) on failure.
   */
  int memory_failure(unsigned long pfn, int flags)
  {
@@ -1627,7 +1724,7 @@ int memory_failure(unsigned long pfn, int flags)
         int res = 0;
         unsigned long page_flags;
         bool retry = true;
-       static DEFINE_MUTEX(mf_mutex);
+       int hugetlb = 0;
  
         if (!sysctl_memory_failure_recovery)
                 panic("Memory failure on page %lx", pfn);
@@ -1648,10 +1745,9 @@ int memory_failure(unsigned long pfn, int flags)
         mutex_lock(&mf_mutex);
  
  try_again:
-       if (PageHuge(p)) {
-               res = memory_failure_hugetlb(pfn, flags);
+       res = try_memory_failure_hugetlb(pfn, flags, &hugetlb);
+       if (hugetlb)
                 goto unlock_mutex;
-       }
  
         if (TestSetPageHWPoison(p)) {
                 pr_err("Memory failure: %#lx: already hardware poisoned\n",
@@ -1708,6 +1804,20 @@ try_again:
         }
  
         if (PageTransHuge(hpage)) {
+               /*
+                * The flag must be set after the refcount is bumped,
+                * otherwise it may race with THP split.
+                * And the flag can't be set in get_hwpoison_page() since
+                * that is also called by soft offline, and here it is only
+                * called for !MF_COUNT_INCREASED.  So this seems to be the
+                * best place.
+                *
+                * There is no need to care about the above error handling
+                * paths for get_hwpoison_page() since they handle either a
+                * free page or an unhandlable page.  The refcount is bumped
+                * iff the page is a valid handlable page.
+                */
+               SetPageHasHWPoisoned(hpage);
                 if (try_to_split_thp_page(p, "Memory Failure") < 0) {
                         action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
                         res = -EBUSY;
@@ -1747,21 +1857,12 @@ try_again:
          */
         page_flags = p->flags;
  
-       /*
-        * unpoison always clear PG_hwpoison inside page lock
-        */
-       if (!PageHWPoison(p)) {
-               pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
-               num_poisoned_pages_dec();
-               unlock_page(p);
-               put_page(p);
-               goto unlock_mutex;
-       }
         if (hwpoison_filter(p)) {
                 if (TestClearPageHWPoison(p))
                         num_poisoned_pages_dec();
                 unlock_page(p);
                 put_page(p);
+               res = -EOPNOTSUPP;
                 goto unlock_mutex;
         }
  
@@ -1937,6 +2038,7 @@ int unpoison_memory(unsigned long pfn)
         struct page *page;
         struct page *p;
         int freeit = 0;
+       int ret = 0;
         unsigned long flags = 0;
         static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
                                         DEFAULT_RATELIMIT_BURST);
@@ -1947,39 +2049,30 @@ int unpoison_memory(unsigned long pfn)
         p = pfn_to_page(pfn);
         page = compound_head(p);
  
+       mutex_lock(&mf_mutex);
+
         if (!PageHWPoison(p)) {
                 unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
                                  pfn, &unpoison_rs);
-               return 0;
+               goto unlock_mutex;
         }
  
         if (page_count(page) > 1) {
                 unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
                                  pfn, &unpoison_rs);
-               return 0;
+               goto unlock_mutex;
         }
  
         if (page_mapped(page)) {
                 unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
                                  pfn, &unpoison_rs);
-               return 0;
+               goto unlock_mutex;
         }
  
         if (page_mapping(page)) {
                 unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
                                  pfn, &unpoison_rs);
-               return 0;
-       }
-
-       /*
-        * unpoison_memory() can encounter thp only when the thp is being
-        * worked by memory_failure() and the page lock is not held yet.
-        * In such case, we yield to memory_failure() and make unpoison fail.
-        */
-       if (!PageHuge(page) && PageTransHuge(page)) {
-               unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
-                                pfn, &unpoison_rs);
-               return 0;
+               goto unlock_mutex;
         }
  
         if (!get_hwpoison_page(p, flags)) {
@@ -1987,29 +2080,23 @@ int unpoison_memory(unsigned long pfn)
                         num_poisoned_pages_dec();
                 unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
                                  pfn, &unpoison_rs);
-               return 0;
+               goto unlock_mutex;
         }
  
-       lock_page(page);
-       /*
-        * This test is racy because PG_hwpoison is set outside of page lock.
-        * That's acceptable because that won't trigger kernel panic. Instead,
-        * the PG_hwpoison page will be caught and isolated on the entrance to
-        * the free buddy page pool.
-        */
         if (TestClearPageHWPoison(page)) {
                 unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
                                  pfn, &unpoison_rs);
                 num_poisoned_pages_dec();
                 freeit = 1;
         }
-       unlock_page(page);
  
         put_page(page);
         if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
                 put_page(page);
  
-       return 0;
+unlock_mutex:
+       mutex_unlock(&mf_mutex);
+       return ret;
  }
  EXPORT_SYMBOL(unpoison_memory);
  
@@ -2190,9 +2277,12 @@ int soft_offline_page(unsigned long pfn, int flags)
                 return -EIO;
         }
  
+       mutex_lock(&mf_mutex);
+
         if (PageHWPoison(page)) {
                 pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
                 put_ref_page(ref_page);
+               mutex_unlock(&mf_mutex);
                 return 0;
         }
  
@@ -2206,9 +2296,12 @@ retry:
         } else if (ret == 0) {
                 if (soft_offline_free_page(page) && try_again) {
                         try_again = false;
+                       flags &= ~MF_COUNT_INCREASED;
                         goto retry;
                 }
         }
  
+       mutex_unlock(&mf_mutex);
+
         return ret;
  }