mm/compaction: avoid rescanning pageblocks in isolate_freepages
[platform/adaptation/renesas_rcar/renesas_kernel.git] / mm / shmem.c
index 1f18c9d..0f14475 100644 (file)
@@ -80,11 +80,12 @@ static struct vfsmount *shm_mnt;
 #define SHORT_SYMLINK_LEN 128
 
 /*
- * shmem_fallocate and shmem_writepage communicate via inode->i_private
- * (with i_mutex making sure that it has only one user at a time):
- * we would prefer not to enlarge the shmem inode just for that.
+ * shmem_fallocate communicates with shmem_fault or shmem_writepage via
+ * inode->i_private (with i_mutex making sure that it has only one user at
+ * a time): we would prefer not to enlarge the shmem inode just for that.
  */
 struct shmem_falloc {
+       wait_queue_head_t *waitq; /* faults into hole wait for punch to end */
        pgoff_t start;          /* start of range currently being fallocated */
        pgoff_t next;           /* the next page offset to be fallocated */
        pgoff_t nr_falloced;    /* how many new pages have been fallocated */
@@ -242,19 +243,17 @@ static int shmem_radix_tree_replace(struct address_space *mapping,
                        pgoff_t index, void *expected, void *replacement)
 {
        void **pslot;
-       void *item = NULL;
+       void *item;
 
        VM_BUG_ON(!expected);
+       VM_BUG_ON(!replacement);
        pslot = radix_tree_lookup_slot(&mapping->page_tree, index);
-       if (pslot)
-               item = radix_tree_deref_slot_protected(pslot,
-                                                       &mapping->tree_lock);
+       if (!pslot)
+               return -ENOENT;
+       item = radix_tree_deref_slot_protected(pslot, &mapping->tree_lock);
        if (item != expected)
                return -ENOENT;
-       if (replacement)
-               radix_tree_replace_slot(pslot, replacement);
-       else
-               radix_tree_delete(&mapping->page_tree, index);
+       radix_tree_replace_slot(pslot, replacement);
        return 0;
 }
 
@@ -331,84 +330,20 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
 }
 
 /*
- * Like find_get_pages, but collecting swap entries as well as pages.
- */
-static unsigned shmem_find_get_pages_and_swap(struct address_space *mapping,
-                                       pgoff_t start, unsigned int nr_pages,
-                                       struct page **pages, pgoff_t *indices)
-{
-       void **slot;
-       unsigned int ret = 0;
-       struct radix_tree_iter iter;
-
-       if (!nr_pages)
-               return 0;
-
-       rcu_read_lock();
-restart:
-       radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
-               struct page *page;
-repeat:
-               page = radix_tree_deref_slot(slot);
-               if (unlikely(!page))
-                       continue;
-               if (radix_tree_exception(page)) {
-                       if (radix_tree_deref_retry(page))
-                               goto restart;
-                       /*
-                        * Otherwise, we must be storing a swap entry
-                        * here as an exceptional entry: so return it
-                        * without attempting to raise page count.
-                        */
-                       goto export;
-               }
-               if (!page_cache_get_speculative(page))
-                       goto repeat;
-
-               /* Has the page moved? */
-               if (unlikely(page != *slot)) {
-                       page_cache_release(page);
-                       goto repeat;
-               }
-export:
-               indices[ret] = iter.index;
-               pages[ret] = page;
-               if (++ret == nr_pages)
-                       break;
-       }
-       rcu_read_unlock();
-       return ret;
-}
-
-/*
  * Remove swap entry from radix tree, free the swap and its page cache.
  */
 static int shmem_free_swap(struct address_space *mapping,
                           pgoff_t index, void *radswap)
 {
-       int error;
+       void *old;
 
        spin_lock_irq(&mapping->tree_lock);
-       error = shmem_radix_tree_replace(mapping, index, radswap, NULL);
+       old = radix_tree_delete_item(&mapping->page_tree, index, radswap);
        spin_unlock_irq(&mapping->tree_lock);
-       if (!error)
-               free_swap_and_cache(radix_to_swp_entry(radswap));
-       return error;
-}
-
-/*
- * Pagevec may contain swap entries, so shuffle up pages before releasing.
- */
-static void shmem_deswap_pagevec(struct pagevec *pvec)
-{
-       int i, j;
-
-       for (i = 0, j = 0; i < pagevec_count(pvec); i++) {
-               struct page *page = pvec->pages[i];
-               if (!radix_tree_exceptional_entry(page))
-                       pvec->pages[j++] = page;
-       }
-       pvec->nr = j;
+       if (old != radswap)
+               return -ENOENT;
+       free_swap_and_cache(radix_to_swp_entry(radswap));
+       return 0;
 }
 
 /*
@@ -429,12 +364,12 @@ void shmem_unlock_mapping(struct address_space *mapping)
                 * Avoid pagevec_lookup(): find_get_pages() returns 0 as if it
                 * has finished, if it hits a row of PAGEVEC_SIZE swap entries.
                 */
-               pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-                                       PAGEVEC_SIZE, pvec.pages, indices);
+               pvec.nr = find_get_entries(mapping, index,
+                                          PAGEVEC_SIZE, pvec.pages, indices);
                if (!pvec.nr)
                        break;
                index = indices[pvec.nr - 1] + 1;
-               shmem_deswap_pagevec(&pvec);
+               pagevec_remove_exceptionals(&pvec);
                check_move_unevictable_pages(pvec.pages, pvec.nr);
                pagevec_release(&pvec);
                cond_resched();
@@ -466,9 +401,9 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
        pagevec_init(&pvec, 0);
        index = start;
        while (index < end) {
-               pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
-                               min(end - index, (pgoff_t)PAGEVEC_SIZE),
-                                                       pvec.pages, indices);
+               pvec.nr = find_get_entries(mapping, index,
+                       min(end - index, (pgoff_t)PAGEVEC_SIZE),
+                       pvec.pages, indices);
                if (!pvec.nr)
                        break;
                mem_cgroup_uncharge_start();
@@ -497,7 +432,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                        }
                        unlock_page(page);
                }
-               shmem_deswap_pagevec(&pvec);
+               pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                cond_resched();
@@ -533,22 +468,20 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                return;
 
        index = start;
-       for ( ; ; ) {
+       while (index < end) {
                cond_resched();
-               pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+
+               pvec.nr = find_get_entries(mapping, index,
                                min(end - index, (pgoff_t)PAGEVEC_SIZE),
-                                                       pvec.pages, indices);
+                               pvec.pages, indices);
                if (!pvec.nr) {
-                       if (index == start || unfalloc)
+                       /* If all gone or hole-punch or unfalloc, we're done */
+                       if (index == start || end != -1)
                                break;
+                       /* But if truncating, restart to make sure all gone */
                        index = start;
                        continue;
                }
-               if ((index == start || unfalloc) && indices[0] >= end) {
-                       shmem_deswap_pagevec(&pvec);
-                       pagevec_release(&pvec);
-                       break;
-               }
                mem_cgroup_uncharge_start();
                for (i = 0; i < pagevec_count(&pvec); i++) {
                        struct page *page = pvec.pages[i];
@@ -560,8 +493,12 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                        if (radix_tree_exceptional_entry(page)) {
                                if (unfalloc)
                                        continue;
-                               nr_swaps_freed += !shmem_free_swap(mapping,
-                                                               index, page);
+                               if (shmem_free_swap(mapping, index, page)) {
+                                       /* Swap was replaced by page: retry */
+                                       index--;
+                                       break;
+                               }
+                               nr_swaps_freed++;
                                continue;
                        }
 
@@ -570,11 +507,16 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
                                if (page->mapping == mapping) {
                                        VM_BUG_ON_PAGE(PageWriteback(page), page);
                                        truncate_inode_page(mapping, page);
+                               } else {
+                                       /* Page was replaced by swap: retry */
+                                       unlock_page(page);
+                                       index--;
+                                       break;
                                }
                        }
                        unlock_page(page);
                }
-               shmem_deswap_pagevec(&pvec);
+               pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                mem_cgroup_uncharge_end();
                index++;
@@ -824,6 +766,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
                        spin_lock(&inode->i_lock);
                        shmem_falloc = inode->i_private;
                        if (shmem_falloc &&
+                           !shmem_falloc->waitq &&
                            index >= shmem_falloc->start &&
                            index < shmem_falloc->next)
                                shmem_falloc->nr_unswapped++;
@@ -1080,7 +1023,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
                return -EFBIG;
 repeat:
        swap.val = 0;
-       page = find_lock_page(mapping, index);
+       page = find_lock_entry(mapping, index);
        if (radix_tree_exceptional_entry(page)) {
                swap = radix_to_swp_entry(page);
                page = NULL;
@@ -1298,6 +1241,64 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        int error;
        int ret = VM_FAULT_LOCKED;
 
+       /*
+        * Trinity finds that probing a hole which tmpfs is punching can
+        * prevent the hole-punch from ever completing: which in turn
+        * locks writers out with its hold on i_mutex.  So refrain from
+        * faulting pages into the hole while it's being punched.  Although
+        * shmem_undo_range() does remove the additions, it may be unable to
+        * keep up, as each new page needs its own unmap_mapping_range() call,
+        * and the i_mmap tree grows ever slower to scan if new vmas are added.
+        *
+        * It does not matter if we sometimes reach this check just before the
+        * hole-punch begins, so that one fault then races with the punch:
+        * we just need to make racing faults a rare case.
+        *
+        * The implementation below would be much simpler if we just used a
+        * standard mutex or completion: but we cannot take i_mutex in fault,
+        * and bloating every shmem inode for this unlikely case would be sad.
+        */
+       if (unlikely(inode->i_private)) {
+               struct shmem_falloc *shmem_falloc;
+
+               spin_lock(&inode->i_lock);
+               shmem_falloc = inode->i_private;
+               if (shmem_falloc &&
+                   shmem_falloc->waitq &&
+                   vmf->pgoff >= shmem_falloc->start &&
+                   vmf->pgoff < shmem_falloc->next) {
+                       wait_queue_head_t *shmem_falloc_waitq;
+                       DEFINE_WAIT(shmem_fault_wait);
+
+                       ret = VM_FAULT_NOPAGE;
+                       if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
+                          !(vmf->flags & FAULT_FLAG_RETRY_NOWAIT)) {
+                               /* It's polite to up mmap_sem if we can */
+                               up_read(&vma->vm_mm->mmap_sem);
+                               ret = VM_FAULT_RETRY;
+                       }
+
+                       shmem_falloc_waitq = shmem_falloc->waitq;
+                       prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
+                                       TASK_UNINTERRUPTIBLE);
+                       spin_unlock(&inode->i_lock);
+                       schedule();
+
+                       /*
+                        * shmem_falloc_waitq points into the shmem_fallocate()
+                        * stack of the hole-punching task: shmem_falloc_waitq
+                        * is usually invalid by the time we reach here, but
+                        * finish_wait() does not dereference it in that case;
+                        * though i_lock needed lest racing with wake_up_all().
+                        */
+                       spin_lock(&inode->i_lock);
+                       finish_wait(shmem_falloc_waitq, &shmem_fault_wait);
+                       spin_unlock(&inode->i_lock);
+                       return ret;
+               }
+               spin_unlock(&inode->i_lock);
+       }
+
        error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret);
        if (error)
                return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS);
@@ -1417,6 +1418,11 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
        return inode;
 }
 
+bool shmem_mapping(struct address_space *mapping)
+{
+       return mapping->backing_dev_info == &shmem_backing_dev_info;
+}
+
 #ifdef CONFIG_TMPFS
 static const struct inode_operations shmem_symlink_inode_operations;
 static const struct inode_operations shmem_short_symlink_operations;
@@ -1729,7 +1735,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
        pagevec_init(&pvec, 0);
        pvec.nr = 1;            /* start small: we may be there already */
        while (!done) {
-               pvec.nr = shmem_find_get_pages_and_swap(mapping, index,
+               pvec.nr = find_get_entries(mapping, index,
                                        pvec.nr, pvec.pages, indices);
                if (!pvec.nr) {
                        if (whence == SEEK_DATA)
@@ -1756,7 +1762,7 @@ static pgoff_t shmem_seek_hole_data(struct address_space *mapping,
                                break;
                        }
                }
-               shmem_deswap_pagevec(&pvec);
+               pagevec_remove_exceptionals(&pvec);
                pagevec_release(&pvec);
                pvec.nr = PAGEVEC_SIZE;
                cond_resched();
@@ -1817,12 +1823,25 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                struct address_space *mapping = file->f_mapping;
                loff_t unmap_start = round_up(offset, PAGE_SIZE);
                loff_t unmap_end = round_down(offset + len, PAGE_SIZE) - 1;
+               DECLARE_WAIT_QUEUE_HEAD_ONSTACK(shmem_falloc_waitq);
+
+               shmem_falloc.waitq = &shmem_falloc_waitq;
+               shmem_falloc.start = unmap_start >> PAGE_SHIFT;
+               shmem_falloc.next = (unmap_end + 1) >> PAGE_SHIFT;
+               spin_lock(&inode->i_lock);
+               inode->i_private = &shmem_falloc;
+               spin_unlock(&inode->i_lock);
 
                if ((u64)unmap_end > (u64)unmap_start)
                        unmap_mapping_range(mapping, unmap_start,
                                            1 + unmap_end - unmap_start, 0);
                shmem_truncate_range(inode, offset, offset + len - 1);
                /* No need to unmap again: hole-punching leaves COWed pages */
+
+               spin_lock(&inode->i_lock);
+               inode->i_private = NULL;
+               wake_up_all(&shmem_falloc_waitq);
+               spin_unlock(&inode->i_lock);
                error = 0;
                goto out;
        }
@@ -1840,6 +1859,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
                goto out;
        }
 
+       shmem_falloc.waitq = NULL;
        shmem_falloc.start = start;
        shmem_falloc.next  = start;
        shmem_falloc.nr_falloced = 0;
@@ -2063,8 +2083,10 @@ static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct
 
        if (new_dentry->d_inode) {
                (void) shmem_unlink(new_dir, new_dentry);
-               if (they_are_dirs)
+               if (they_are_dirs) {
+                       drop_nlink(new_dentry->d_inode);
                        drop_nlink(old_dir);
+               }
        } else if (they_are_dirs) {
                drop_nlink(old_dir);
                inc_nlink(new_dir);