hugetlb_cgroup: add reservation accounting for private mappings

[platform/kernel/linux-rpi.git] / mm / hugetlb.c
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index dd8737a..5b6d83e 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -650,6 +650,25 @@ static void set_vma_private_data(struct vm_area_struct *vma,
         vma->vm_private_data = (void *)value;
  }
  
+static void
+resv_map_set_hugetlb_cgroup_uncharge_info(struct resv_map *resv_map,
+                                         struct hugetlb_cgroup *h_cg,
+                                         struct hstate *h)
+{
+#ifdef CONFIG_CGROUP_HUGETLB
+       if (!h_cg || !h) {      /* either is NULL: clear all three fields */
+               resv_map->reservation_counter = NULL;
+               resv_map->pages_per_hpage = 0;
+               resv_map->css = NULL;
+       } else {                /* point the map at h_cg's counter for h */
+               resv_map->reservation_counter =
+                       &h_cg->rsvd_hugepage[hstate_index(h)];
+               resv_map->pages_per_hpage = pages_per_huge_page(h);
+               resv_map->css = &h_cg->css;
+       }
+#endif /* CONFIG_CGROUP_HUGETLB; without it this function is a no-op */
+}
+
  struct resv_map *resv_map_alloc(void)
  {
         struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL);
@@ -666,6 +685,13 @@ struct resv_map *resv_map_alloc(void)
         INIT_LIST_HEAD(&resv_map->regions);
  
         resv_map->adds_in_progress = 0;
+       /*
+        * Initialize these to 0. On shared mappings, 0's here indicate these
+        * fields don't do cgroup accounting. On private mappings, these will be
+        * re-initialized to the proper values, to indicate that hugetlb cgroup
+        * reservations are to be un-charged from here.
+        */
+       resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, NULL, NULL);
  
         INIT_LIST_HEAD(&resv_map->region_cache);
         list_add(&rg->link, &resv_map->region_cache);
@@ -1009,6 +1035,9 @@ static void destroy_compound_gigantic_page(struct page *page,
         struct page *p = page + 1;
  
         atomic_set(compound_mapcount_ptr(page), 0);
+       if (hpage_pincount_available(page))
+               atomic_set(compound_pincount_ptr(page), 0);
+
         for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) {
                 clear_compound_head(p);
                 set_page_refcounted(p);
@@ -1069,6 +1098,7 @@ static void update_and_free_page(struct hstate *h, struct page *page)
                                 1 << PG_writeback);
         }
         VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page);
+       VM_BUG_ON_PAGE(hugetlb_cgroup_from_page_rsvd(page), page);
         set_compound_page_dtor(page, NULL_COMPOUND_DTOR);
         set_page_refcounted(page);
         if (hstate_is_gigantic(h)) {
@@ -1254,6 +1284,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid)
         set_compound_page_dtor(page, HUGETLB_PAGE_DTOR);
         spin_lock(&hugetlb_lock);
         set_hugetlb_cgroup(page, NULL);
+       set_hugetlb_cgroup_rsvd(page, NULL);
         h->nr_huge_pages++;
         h->nr_huge_pages_node[nid]++;
         spin_unlock(&hugetlb_lock);
@@ -1287,6 +1318,9 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
                 set_compound_head(p, page);
         }
         atomic_set(compound_mapcount_ptr(page), -1);
+
+       if (hpage_pincount_available(page))
+               atomic_set(compound_pincount_ptr(page), 0);
  }
  
  /*
@@ -1316,6 +1350,106 @@ int PageHeadHuge(struct page *page_head)
         return get_compound_page_dtor(page_head) == free_huge_page;
  }
  
+/*
+ * Find the address_space associated with a hugetlbfs page; NULL if none.
+ * Upon entry the page is locked and 'was' mapped, although the mapped
+ * state could change.  If necessary, use anon_vma to find a vma and its
+ * address space.  The returned mapping may be stale, but it cannot be
+ * invalid as the page lock (which is held) is required to destroy it.
+ */
+static struct address_space *_get_hugetlb_page_mapping(struct page *hpage)
+{
+       struct anon_vma *anon_vma;
+       pgoff_t pgoff_start, pgoff_end;
+       struct anon_vma_chain *avc;
+       struct address_space *mapping = page_mapping(hpage);
+
+       /* Simple file based mapping: page_mapping() already found it */
+       if (mapping)
+               return mapping;
+
+       /*
+        * Even anonymous hugetlbfs mappings are associated with an
+        * underlying hugetlbfs file (see hugetlb_file_setup in mmap
+        * code).  Find a vma associated with the anonymous vma, and
+        * use the file pointer to get address_space.
+        */
+       anon_vma = page_lock_anon_vma_read(hpage);
+       if (!anon_vma)
+               return mapping;  /* NULL */
+
+       /* Use the first vma found in the range covered by this huge page */
+       pgoff_start = page_to_pgoff(hpage);
+       pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1;
+       anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
+                                       pgoff_start, pgoff_end) {
+               struct vm_area_struct *vma = avc->vma;
+
+               mapping = vma->vm_file->f_mapping;
+               break;
+       }
+
+       anon_vma_unlock_read(anon_vma);
+       return mapping;
+}
+
+/*
+ * Find and lock address space (mapping) in write mode.  Returns the
+ * mapping with i_mmap_rwsem held for write, or NULL with it not held.
+ *
+ * Upon entry, the page is locked which allows us to find the mapping
+ * even in the case of an anon page.  However, locking order dictates
+ * the i_mmap_rwsem be acquired BEFORE the page lock.  This is hugetlbfs
+ * specific.  So, we first try the rwsem while still holding the page
+ * lock.  If this works, great!  If not, drop the page lock, acquire
+ * i_mmap_rwsem, reacquire the page lock, and revalidate state.
+ */
+struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage)
+{
+       struct address_space *mapping, *mapping2;
+
+       mapping = _get_hugetlb_page_mapping(hpage);
+retry:
+       if (!mapping)
+               return mapping;
+
+       /*
+        * If no contention, take lock and return
+        */
+       if (i_mmap_trylock_write(mapping))
+               return mapping;
+
+       /*
+        * Must drop page lock and wait on i_mmap_rwsem.
+        * Note:  Once page lock is dropped, mapping could become invalid.
+        * As a hack, increase map count until we lock page again.
+        */
+       atomic_inc(&hpage->_mapcount);
+       unlock_page(hpage);
+       i_mmap_lock_write(mapping);
+       lock_page(hpage);
+       atomic_add_negative(-1, &hpage->_mapcount);     /* undo the inc */
+
+       /* verify page is still mapped; if not, drop the lock and give up */
+       if (!page_mapped(hpage)) {
+               i_mmap_unlock_write(mapping);
+               return NULL;
+       }
+
+       /*
+        * Get address space again and verify it is the same one
+        * we locked.  If not, drop lock and retry.
+        */
+       mapping2 = _get_hugetlb_page_mapping(hpage);
+       if (mapping2 != mapping) {
+               i_mmap_unlock_write(mapping);
+               mapping = mapping2;
+               goto retry;
+       }
+
+       return mapping;
+}
+
  pgoff_t __basepage_index(struct page *page)
  {
         struct page *page_head = compound_head(page);
@@ -3188,9 +3322,7 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
         end = vma_hugecache_offset(h, vma, vma->vm_end);
  
         reserve = (end - start) - region_count(resv, start, end);
-
-       kref_put(&resv->refs, resv_map_release);
-
+       hugetlb_cgroup_uncharge_counter(resv, start, end);
         if (reserve) {
                 /*
                  * Decrement reserve counts.  The global reserve count may be
@@ -3199,6 +3331,8 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma)
                 gbl_reserve = hugepage_subpool_put_pages(spool, reserve);
                 hugetlb_acct_memory(h, -gbl_reserve);
         }
+
+       kref_put(&resv->refs, resv_map_release);
  }
  
  static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr)
@@ -3306,6 +3440,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
         int cow;
         struct hstate *h = hstate_vma(vma);
         unsigned long sz = huge_page_size(h);
+       struct address_space *mapping = vma->vm_file->f_mapping;
         struct mmu_notifier_range range;
         int ret = 0;
  
@@ -3316,6 +3451,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
                                         vma->vm_start,
                                         vma->vm_end);
                 mmu_notifier_invalidate_range_start(&range);
+       } else {
+               /*
+                * For shared mappings i_mmap_rwsem must be held to call
+                * huge_pte_alloc, otherwise the returned ptep could go
+                * away if part of a shared pmd and another thread calls
+                * huge_pmd_unshare.
+                */
+               i_mmap_lock_read(mapping);
         }
  
         for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
@@ -3393,6 +3536,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
  
         if (cow)
                 mmu_notifier_invalidate_range_end(&range);
+       else
+               i_mmap_unlock_read(mapping);
  
         return ret;
  }
@@ -3812,16 +3957,17 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm,
         }
  
         /*
-        * Use page lock to guard against racing truncation
-        * before we get page_table_lock.
+        * We can not race with truncation due to holding i_mmap_rwsem.
+        * i_size is modified when holding i_mmap_rwsem, so check here
+        * once for faults beyond end of file.
          */
+       size = i_size_read(mapping->host) >> huge_page_shift(h);
+       if (idx >= size)
+               goto out;
+
  retry:
         page = find_lock_page(mapping, idx);
         if (!page) {
-               size = i_size_read(mapping->host) >> huge_page_shift(h);
-               if (idx >= size)
-                       goto out;
-
                 /*
                  * Check for page in userfault range
                  */
@@ -3841,13 +3987,15 @@ retry:
                         };
  
                         /*
-                        * hugetlb_fault_mutex must be dropped before
-                        * handling userfault.  Reacquire after handling
-                        * fault to make calling code simpler.
+                        * hugetlb_fault_mutex and i_mmap_rwsem must be
+                        * dropped before handling userfault.  Reacquire
+                        * after handling fault to make calling code simpler.
                          */
                         hash = hugetlb_fault_mutex_hash(mapping, idx);
                         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+                       i_mmap_unlock_read(mapping);
                         ret = handle_userfault(&vmf, VM_UFFD_MISSING);
+                       i_mmap_lock_read(mapping);
                         mutex_lock(&hugetlb_fault_mutex_table[hash]);
                         goto out;
                 }
@@ -3925,10 +4073,6 @@ retry:
         }
  
         ptl = huge_pte_lock(h, mm, ptep);
-       size = i_size_read(mapping->host) >> huge_page_shift(h);
-       if (idx >= size)
-               goto backout;
-
         ret = 0;
         if (!huge_pte_none(huge_ptep_get(ptep)))
                 goto backout;
@@ -4012,6 +4156,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
  
         ptep = huge_pte_offset(mm, haddr, huge_page_size(h));
         if (ptep) {
+               /*
+                * Since we hold no locks, ptep could be stale.  That is
+                * OK as we are only making decisions based on content and
+                * not actually modifying content here.
+                */
                 entry = huge_ptep_get(ptep);
                 if (unlikely(is_hugetlb_entry_migration(entry))) {
                         migration_entry_wait_huge(vma, mm, ptep);
@@ -4025,14 +4174,31 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                         return VM_FAULT_OOM;
         }
  
+       /*
+        * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold
+        * until finished with ptep.  This serves two purposes:
+        * 1) It prevents huge_pmd_unshare from being called elsewhere
+        *    and making the ptep no longer valid.
+        * 2) It synchronizes us with i_size modifications during truncation.
+        *
+        * ptep could have already been assigned via huge_pte_offset.  That
+        * is OK, as huge_pte_alloc will return the same value unless
+        * something has changed.
+        */
         mapping = vma->vm_file->f_mapping;
-       idx = vma_hugecache_offset(h, vma, haddr);
+       i_mmap_lock_read(mapping);
+       ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
+       if (!ptep) {
+               i_mmap_unlock_read(mapping);
+               return VM_FAULT_OOM;
+       }
  
         /*
          * Serialize hugepage allocation and instantiation, so that we don't
          * get spurious allocation failures if two CPUs race to instantiate
          * the same page in the page cache.
          */
+       idx = vma_hugecache_offset(h, vma, haddr);
         hash = hugetlb_fault_mutex_hash(mapping, idx);
         mutex_lock(&hugetlb_fault_mutex_table[hash]);
  
@@ -4120,6 +4286,7 @@ out_ptl:
         }
  out_mutex:
         mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+       i_mmap_unlock_read(mapping);
         /*
          * Generally it's safe to hold refcount during waiting page lock. But
          * here we just wait to defer the next page fault to avoid busy loop and
@@ -4266,7 +4433,7 @@ out_release_nounlock:
  long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                          struct page **pages, struct vm_area_struct **vmas,
                          unsigned long *position, unsigned long *nr_pages,
-                        long i, unsigned int flags, int *nonblocking)
+                        long i, unsigned int flags, int *locked)
  {
         unsigned long pfn_offset;
         unsigned long vaddr = *position;
@@ -4337,14 +4504,17 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                 spin_unlock(ptl);
                         if (flags & FOLL_WRITE)
                                 fault_flags |= FAULT_FLAG_WRITE;
-                       if (nonblocking)
-                               fault_flags |= FAULT_FLAG_ALLOW_RETRY;
+                       if (locked)
+                               fault_flags |= FAULT_FLAG_ALLOW_RETRY |
+                                       FAULT_FLAG_KILLABLE;
                         if (flags & FOLL_NOWAIT)
                                 fault_flags |= FAULT_FLAG_ALLOW_RETRY |
                                         FAULT_FLAG_RETRY_NOWAIT;
                         if (flags & FOLL_TRIED) {
-                               VM_WARN_ON_ONCE(fault_flags &
-                                               FAULT_FLAG_ALLOW_RETRY);
+                               /*
+                                * Note: FAULT_FLAG_ALLOW_RETRY and
+                                * FAULT_FLAG_TRIED can co-exist
+                                */
                                 fault_flags |= FAULT_FLAG_TRIED;
                         }
                         ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
@@ -4354,9 +4524,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                                 break;
                         }
                         if (ret & VM_FAULT_RETRY) {
-                               if (nonblocking &&
+                               if (locked &&
                                     !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
-                                       *nonblocking = 0;
+                                       *locked = 0;
                                 *nr_pages = 0;
                                 /*
                                  * VM_FAULT_RETRY must not return an
@@ -4376,19 +4546,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 page = pte_page(huge_ptep_get(pte));
  
                 /*
-                * Instead of doing 'try_get_page()' below in the same_page
-                * loop, just check the count once here.
-                */
-               if (unlikely(page_count(page) <= 0)) {
-                       if (pages) {
-                               spin_unlock(ptl);
-                               remainder = 0;
-                               err = -ENOMEM;
-                               break;
-                       }
-               }
-
-               /*
                  * If subpage information not requested, update counters
                  * and skip the same_page loop below.
                  */
@@ -4405,7 +4562,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
  same_page:
                 if (pages) {
                         pages[i] = mem_map_offset(page, pfn_offset);
-                       get_page(pages[i]);
+                       /*
+                        * try_grab_page() should always succeed here, because:
+                        * a) we hold the ptl lock, and b) we've just checked
+                        * that the huge page is present in the page tables. If
+                        * the huge page is present, then the tail pages must
+                        * also be present. The ptl prevents the head page and
+                        * tail pages from being rearranged in any way. So this
+                        * page must be available at this point, unless the page
+                        * refcount overflowed:
+                        */
+                       if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+                               spin_unlock(ptl);
+                               remainder = 0;
+                               err = -ENOMEM;
+                               break;
+                       }
                 }
  
                 if (vmas)
@@ -4545,6 +4717,7 @@ int hugetlb_reserve_pages(struct inode *inode,
         struct hstate *h = hstate_inode(inode);
         struct hugepage_subpool *spool = subpool_inode(inode);
         struct resv_map *resv_map;
+       struct hugetlb_cgroup *h_cg;
         long gbl_reserve;
  
         /* This should never happen */
@@ -4578,12 +4751,26 @@ int hugetlb_reserve_pages(struct inode *inode,
                 chg = region_chg(resv_map, from, to);
  
         } else {
+               /* Private mapping. */
                 resv_map = resv_map_alloc();
                 if (!resv_map)
                         return -ENOMEM;
  
                 chg = to - from;
  
+               if (hugetlb_cgroup_charge_cgroup_rsvd(
+                           hstate_index(h), chg * pages_per_huge_page(h),
+                           &h_cg)) {
+                       kref_put(&resv_map->refs, resv_map_release);
+                       return -ENOMEM;
+               }
+
+               /*
+                * Since this branch handles private mappings, we attach the
+                * counter to uncharge for this reservation off resv_map.
+                */
+               resv_map_set_hugetlb_cgroup_uncharge_info(resv_map, h_cg, h);
+
                 set_vma_resv_map(vma, resv_map);
                 set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
         }
@@ -4765,10 +4952,12 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
   * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc()
   * and returns the corresponding pte. While this is not necessary for the
   * !shared pmd case because we can allocate the pmd later as well, it makes the
- * code much cleaner. pmd allocation is essential for the shared case because
- * pud has to be populated inside the same i_mmap_rwsem section - otherwise
- * racing tasks could either miss the sharing (see huge_pte_offset) or select a
- * bad pmd for sharing.
+ * code much cleaner.
+ *
+ * This routine must be called with i_mmap_rwsem held in at least read mode.
+ * For hugetlbfs, this prevents removal of any page table entries associated
+ * with the address space.  This is important as we are setting up sharing
+ * based on existing page table entries (mappings).
   */
  pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
  {
@@ -4785,7 +4974,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
         if (!vma_shareable(vma, addr))
                 return (pte_t *)pmd_alloc(mm, pud, addr);
  
-       i_mmap_lock_read(mapping);
         vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) {
                 if (svma == vma)
                         continue;
@@ -4815,7 +5003,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
         spin_unlock(ptl);
  out:
         pte = (pte_t *)pmd_alloc(mm, pud, addr);
-       i_mmap_unlock_read(mapping);
         return pte;
  }
  
@@ -4826,7 +5013,7 @@ out:
   * indicated by page_count > 1, unmap is achieved by clearing pud and
   * decrementing the ref count. If count == 1, the pte page is not shared.
   *
- * called with page table lock held.
+ * Called with page table lock held and i_mmap_rwsem held in write mode.
   *
   * returns: 1 successfully unmapped a shared pte page
   *         0 the underlying pte page is not shared, or it is the last user
@@ -4965,6 +5152,12 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
         struct page *page = NULL;
         spinlock_t *ptl;
         pte_t pte;
+
+       /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+       if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+                        (FOLL_PIN | FOLL_GET)))
+               return NULL;
+
  retry:
         ptl = pmd_lockptr(mm, pmd);
         spin_lock(ptl);
@@ -4977,8 +5170,18 @@ retry:
         pte = huge_ptep_get((pte_t *)pmd);
         if (pte_present(pte)) {
                 page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
-               if (flags & FOLL_GET)
-                       get_page(page);
+               /*
+                * try_grab_page() should always succeed here, because: a) we
+                * hold the pmd (ptl) lock, and b) we've just checked that the
+                * huge pmd (head) page is present in the page tables. The ptl
+                * prevents the head page and tail pages from being rearranged
+                * in any way. So this page must be available at this point,
+                * unless the page refcount overflowed:
+                */
+               if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+                       page = NULL;
+                       goto out;
+               }
         } else {
                 if (is_hugetlb_entry_migration(pte)) {
                         spin_unlock(ptl);
@@ -4999,7 +5202,7 @@ struct page * __weak
  follow_huge_pud(struct mm_struct *mm, unsigned long address,
                 pud_t *pud, int flags)
  {
-       if (flags & FOLL_GET)
+       if (flags & (FOLL_GET | FOLL_PIN))
                 return NULL;
  
         return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
@@ -5008,7 +5211,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
  struct page * __weak
  follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
  {
-       if (flags & FOLL_GET)
+       if (flags & (FOLL_GET | FOLL_PIN))
                 return NULL;
  
         return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);