mm: Remove i_mmap_lock lockbreak
author Peter Zijlstra <a.p.zijlstra@chello.nl>
Wed, 25 May 2011 00:12:04 +0000 (17:12 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Wed, 25 May 2011 15:39:17 +0000 (08:39 -0700)
Hugh says:
 "The only significant loser, I think, would be page reclaim (when
  concurrent with truncation): could spin for a long time waiting for
  the i_mmap_mutex it expects would soon be dropped?"

Counter-points:
 - CPU contention makes the spin stop (need_resched())
 - the zap path should be freeing pages at a higher rate than reclaim
   ever can

I think the simplification of the truncate code is definitely worth it.

Effectively reverts commit 2aa15890f3c ("mm: prevent concurrent
unmap_mapping_range() on the same inode") and takes out the code that
caused its problem.
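
For reference, here is what the locking in unmap_mapping_range() boils
down to after this change (a condensed sketch taken from the mm/memory.c
hunk below, not verbatim source): a single pass under i_mmap_lock, with
no truncate_count bookkeeping and no restart loop, while latency is
handled by the cond_resched() added to zap_pmd_range():

	/* condensed sketch of the post-patch unmap_mapping_range() body */
	spin_lock(&mapping->i_mmap_lock);
	if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
		unmap_mapping_range_tree(&mapping->i_mmap, &details);
	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
	spin_unlock(&mapping->i_mmap_lock);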

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Miller <davem@davemloft.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: Russell King <rmk@arm.linux.org.uk>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Jeff Dike <jdike@addtoit.com>
Cc: Richard Weinberger <richard@nod.at>
Cc: Tony Luck <tony.luck@intel.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Namhyung Kim <namhyung@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/inode.c
include/linux/fs.h
include/linux/mm.h
include/linux/mm_types.h
kernel/fork.c
mm/memory.c
mm/mmap.c
mm/mremap.c

index 05f4fa5..7a7284c 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -331,7 +331,6 @@ void address_space_init_once(struct address_space *mapping)
        spin_lock_init(&mapping->private_lock);
        INIT_RAW_PRIO_TREE_ROOT(&mapping->i_mmap);
        INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
-       mutex_init(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(address_space_init_once);
 
index cdf9495..5d2c86b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -635,7 +635,6 @@ struct address_space {
        struct prio_tree_root   i_mmap;         /* tree of private and shared mappings */
        struct list_head        i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
        spinlock_t              i_mmap_lock;    /* protect tree, count, list */
-       unsigned int            truncate_count; /* Cover race condition with truncate */
        unsigned long           nrpages;        /* number of total pages */
        pgoff_t                 writeback_index;/* writeback starts here */
        const struct address_space_operations *a_ops;   /* methods */
@@ -644,7 +643,6 @@ struct address_space {
        spinlock_t              private_lock;   /* for use by the address_space */
        struct list_head        private_list;   /* ditto */
        struct address_space    *assoc_mapping; /* ditto */
-       struct mutex            unmap_mutex;    /* to protect unmapping */
 } __attribute__((aligned(sizeof(long))));
        /*
         * On most architectures that alignment is already the case; but
index ffcce9b..2ad0ac8 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -895,8 +895,6 @@ struct zap_details {
        struct address_space *check_mapping;    /* Check page->mapping if set */
        pgoff_t first_index;                    /* Lowest page->index to unmap */
        pgoff_t last_index;                     /* Highest page->index to unmap */
-       spinlock_t *i_mmap_lock;                /* For unmap_mapping_range: */
-       unsigned long truncate_count;           /* Compare vm_truncate_count */
 };
 
 struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
index 02aa561..201998e 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -175,7 +175,6 @@ struct vm_area_struct {
                                           units, *not* PAGE_CACHE_SIZE */
        struct file * vm_file;          /* File we map to (can be NULL). */
        void * vm_private_data;         /* was vm_pte (shared mem) */
-       unsigned long vm_truncate_count;/* truncate_count or restart_addr */
 
 #ifndef CONFIG_MMU
        struct vm_region *vm_region;    /* NOMMU mapping region */
index 2b44d82..4eef925 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -386,7 +386,6 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
                        spin_lock(&mapping->i_mmap_lock);
                        if (tmp->vm_flags & VM_SHARED)
                                mapping->i_mmap_writable++;
-                       tmp->vm_truncate_count = mpnt->vm_truncate_count;
                        flush_dcache_mmap_lock(mapping);
                        /* insert tmp into the share list, just after mpnt */
                        vma_prio_tree_add(tmp, mpnt);
index 17193d7..1865587 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -986,13 +986,13 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 static unsigned long zap_pte_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end,
-                               long *zap_work, struct zap_details *details)
+                               struct zap_details *details)
 {
        struct mm_struct *mm = tlb->mm;
        int force_flush = 0;
-       pte_t *pte;
-       spinlock_t *ptl;
        int rss[NR_MM_COUNTERS];
+       spinlock_t *ptl;
+       pte_t *pte;
 
 again:
        init_rss_vec(rss);
@@ -1001,12 +1001,9 @@ again:
        do {
                pte_t ptent = *pte;
                if (pte_none(ptent)) {
-                       (*zap_work)--;
                        continue;
                }
 
-               (*zap_work) -= PAGE_SIZE;
-
                if (pte_present(ptent)) {
                        struct page *page;
 
@@ -1075,7 +1072,7 @@ again:
                                print_bad_pte(vma, addr, ptent, NULL);
                }
                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
-       } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
+       } while (pte++, addr += PAGE_SIZE, addr != end);
 
        add_mm_rss_vec(mm, rss);
        arch_leave_lazy_mmu_mode();
@@ -1099,7 +1096,7 @@ again:
 static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pud_t *pud,
                                unsigned long addr, unsigned long end,
-                               long *zap_work, struct zap_details *details)
+                               struct zap_details *details)
 {
        pmd_t *pmd;
        unsigned long next;
@@ -1111,19 +1108,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
                        if (next-addr != HPAGE_PMD_SIZE) {
                                VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem));
                                split_huge_page_pmd(vma->vm_mm, pmd);
-                       } else if (zap_huge_pmd(tlb, vma, pmd)) {
-                               (*zap_work)--;
+                       } else if (zap_huge_pmd(tlb, vma, pmd))
                                continue;
-                       }
                        /* fall through */
                }
-               if (pmd_none_or_clear_bad(pmd)) {
-                       (*zap_work)--;
+               if (pmd_none_or_clear_bad(pmd))
                        continue;
-               }
-               next = zap_pte_range(tlb, vma, pmd, addr, next,
-                                               zap_work, details);
-       } while (pmd++, addr = next, (addr != end && *zap_work > 0));
+               next = zap_pte_range(tlb, vma, pmd, addr, next, details);
+               cond_resched();
+       } while (pmd++, addr = next, addr != end);
 
        return addr;
 }
@@ -1131,7 +1124,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma, pgd_t *pgd,
                                unsigned long addr, unsigned long end,
-                               long *zap_work, struct zap_details *details)
+                               struct zap_details *details)
 {
        pud_t *pud;
        unsigned long next;
@@ -1139,13 +1132,10 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
-               if (pud_none_or_clear_bad(pud)) {
-                       (*zap_work)--;
+               if (pud_none_or_clear_bad(pud))
                        continue;
-               }
-               next = zap_pmd_range(tlb, vma, pud, addr, next,
-                                               zap_work, details);
-       } while (pud++, addr = next, (addr != end && *zap_work > 0));
+               next = zap_pmd_range(tlb, vma, pud, addr, next, details);
+       } while (pud++, addr = next, addr != end);
 
        return addr;
 }
@@ -1153,7 +1143,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
 static unsigned long unmap_page_range(struct mmu_gather *tlb,
                                struct vm_area_struct *vma,
                                unsigned long addr, unsigned long end,
-                               long *zap_work, struct zap_details *details)
+                               struct zap_details *details)
 {
        pgd_t *pgd;
        unsigned long next;
@@ -1167,13 +1157,10 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb,
        pgd = pgd_offset(vma->vm_mm, addr);
        do {
                next = pgd_addr_end(addr, end);
-               if (pgd_none_or_clear_bad(pgd)) {
-                       (*zap_work)--;
+               if (pgd_none_or_clear_bad(pgd))
                        continue;
-               }
-               next = zap_pud_range(tlb, vma, pgd, addr, next,
-                                               zap_work, details);
-       } while (pgd++, addr = next, (addr != end && *zap_work > 0));
+               next = zap_pud_range(tlb, vma, pgd, addr, next, details);
+       } while (pgd++, addr = next, addr != end);
        tlb_end_vma(tlb, vma);
        mem_cgroup_uncharge_end();
 
@@ -1218,9 +1205,7 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
                unsigned long end_addr, unsigned long *nr_accounted,
                struct zap_details *details)
 {
-       long zap_work = ZAP_BLOCK_SIZE;
        unsigned long start = start_addr;
-       spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
        struct mm_struct *mm = vma->vm_mm;
 
        mmu_notifier_invalidate_range_start(mm, start_addr, end_addr);
@@ -1253,33 +1238,15 @@ unsigned long unmap_vmas(struct mmu_gather *tlb,
                                 * Since no pte has actually been setup, it is
                                 * safe to do nothing in this case.
                                 */
-                               if (vma->vm_file) {
+                               if (vma->vm_file)
                                        unmap_hugepage_range(vma, start, end, NULL);
-                                       zap_work -= (end - start) /
-                                       pages_per_huge_page(hstate_vma(vma));
-                               }
 
                                start = end;
                        } else
-                               start = unmap_page_range(tlb, vma,
-                                               start, end, &zap_work, details);
-
-                       if (zap_work > 0) {
-                               BUG_ON(start != end);
-                               break;
-                       }
-
-                       if (need_resched() ||
-                               (i_mmap_lock && spin_needbreak(i_mmap_lock))) {
-                               if (i_mmap_lock)
-                                       goto out;
-                               cond_resched();
-                       }
-
-                       zap_work = ZAP_BLOCK_SIZE;
+                               start = unmap_page_range(tlb, vma, start, end, details);
                }
        }
-out:
+
        mmu_notifier_invalidate_range_end(mm, start_addr, end_addr);
        return start;   /* which is now the end (or restart) address */
 }
@@ -2612,96 +2579,11 @@ unwritable_page:
        return ret;
 }
 
-/*
- * Helper functions for unmap_mapping_range().
- *
- * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
- *
- * We have to restart searching the prio_tree whenever we drop the lock,
- * since the iterator is only valid while the lock is held, and anyway
- * a later vma might be split and reinserted earlier while lock dropped.
- *
- * The list of nonlinear vmas could be handled more efficiently, using
- * a placeholder, but handle it in the same way until a need is shown.
- * It is important to search the prio_tree before nonlinear list: a vma
- * may become nonlinear and be shifted from prio_tree to nonlinear list
- * while the lock is dropped; but never shifted from list to prio_tree.
- *
- * In order to make forward progress despite restarting the search,
- * vm_truncate_count is used to mark a vma as now dealt with, so we can
- * quickly skip it next time around.  Since the prio_tree search only
- * shows us those vmas affected by unmapping the range in question, we
- * can't efficiently keep all vmas in step with mapping->truncate_count:
- * so instead reset them all whenever it wraps back to 0 (then go to 1).
- * mapping->truncate_count and vma->vm_truncate_count are protected by
- * i_mmap_lock.
- *
- * In order to make forward progress despite repeatedly restarting some
- * large vma, note the restart_addr from unmap_vmas when it breaks out:
- * and restart from that address when we reach that vma again.  It might
- * have been split or merged, shrunk or extended, but never shifted: so
- * restart_addr remains valid so long as it remains in the vma's range.
- * unmap_mapping_range forces truncate_count to leap over page-aligned
- * values so we can save vma's restart_addr in its truncate_count field.
- */
-#define is_restart_addr(truncate_count) (!((truncate_count) & ~PAGE_MASK))
-
-static void reset_vma_truncate_counts(struct address_space *mapping)
-{
-       struct vm_area_struct *vma;
-       struct prio_tree_iter iter;
-
-       vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
-               vma->vm_truncate_count = 0;
-       list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
-               vma->vm_truncate_count = 0;
-}
-
-static int unmap_mapping_range_vma(struct vm_area_struct *vma,
+static void unmap_mapping_range_vma(struct vm_area_struct *vma,
                unsigned long start_addr, unsigned long end_addr,
                struct zap_details *details)
 {
-       unsigned long restart_addr;
-       int need_break;
-
-       /*
-        * files that support invalidating or truncating portions of the
-        * file from under mmaped areas must have their ->fault function
-        * return a locked page (and set VM_FAULT_LOCKED in the return).
-        * This provides synchronisation against concurrent unmapping here.
-        */
-
-again:
-       restart_addr = vma->vm_truncate_count;
-       if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
-               start_addr = restart_addr;
-               if (start_addr >= end_addr) {
-                       /* Top of vma has been split off since last time */
-                       vma->vm_truncate_count = details->truncate_count;
-                       return 0;
-               }
-       }
-
-       restart_addr = zap_page_range(vma, start_addr,
-                                       end_addr - start_addr, details);
-       need_break = need_resched() || spin_needbreak(details->i_mmap_lock);
-
-       if (restart_addr >= end_addr) {
-               /* We have now completed this vma: mark it so */
-               vma->vm_truncate_count = details->truncate_count;
-               if (!need_break)
-                       return 0;
-       } else {
-               /* Note restart_addr in vma's truncate_count field */
-               vma->vm_truncate_count = restart_addr;
-               if (!need_break)
-                       goto again;
-       }
-
-       spin_unlock(details->i_mmap_lock);
-       cond_resched();
-       spin_lock(details->i_mmap_lock);
-       return -EINTR;
+       zap_page_range(vma, start_addr, end_addr - start_addr, details);
 }
 
 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
@@ -2711,12 +2593,8 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
        struct prio_tree_iter iter;
        pgoff_t vba, vea, zba, zea;
 
-restart:
        vma_prio_tree_foreach(vma, &iter, root,
                        details->first_index, details->last_index) {
-               /* Skip quickly over those we have already dealt with */
-               if (vma->vm_truncate_count == details->truncate_count)
-                       continue;
 
                vba = vma->vm_pgoff;
                vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
@@ -2728,11 +2606,10 @@ restart:
                if (zea > vea)
                        zea = vea;
 
-               if (unmap_mapping_range_vma(vma,
+               unmap_mapping_range_vma(vma,
                        ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
                        ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
-                               details) < 0)
-                       goto restart;
+                               details);
        }
 }
 
@@ -2747,15 +2624,9 @@ static inline void unmap_mapping_range_list(struct list_head *head,
         * across *all* the pages in each nonlinear VMA, not just the pages
         * whose virtual address lies outside the file truncation point.
         */
-restart:
        list_for_each_entry(vma, head, shared.vm_set.list) {
-               /* Skip quickly over those we have already dealt with */
-               if (vma->vm_truncate_count == details->truncate_count)
-                       continue;
                details->nonlinear_vma = vma;
-               if (unmap_mapping_range_vma(vma, vma->vm_start,
-                                       vma->vm_end, details) < 0)
-                       goto restart;
+               unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
        }
 }
 
@@ -2794,26 +2665,14 @@ void unmap_mapping_range(struct address_space *mapping,
        details.last_index = hba + hlen - 1;
        if (details.last_index < details.first_index)
                details.last_index = ULONG_MAX;
-       details.i_mmap_lock = &mapping->i_mmap_lock;
 
-       mutex_lock(&mapping->unmap_mutex);
-       spin_lock(&mapping->i_mmap_lock);
-
-       /* Protect against endless unmapping loops */
-       mapping->truncate_count++;
-       if (unlikely(is_restart_addr(mapping->truncate_count))) {
-               if (mapping->truncate_count == 0)
-                       reset_vma_truncate_counts(mapping);
-               mapping->truncate_count++;
-       }
-       details.truncate_count = mapping->truncate_count;
 
+       spin_lock(&mapping->i_mmap_lock);
        if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
                unmap_mapping_range_tree(&mapping->i_mmap, &details);
        if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
                unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
        spin_unlock(&mapping->i_mmap_lock);
-       mutex_unlock(&mapping->unmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
 
index 40d4998..50cb04b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -445,10 +445,8 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma,
        if (vma->vm_file)
                mapping = vma->vm_file->f_mapping;
 
-       if (mapping) {
+       if (mapping)
                spin_lock(&mapping->i_mmap_lock);
-               vma->vm_truncate_count = mapping->truncate_count;
-       }
 
        __vma_link(mm, vma, prev, rb_link, rb_parent);
        __vma_link_file(vma);
@@ -558,16 +556,7 @@ again:                     remove_next = 1 + (end > next->vm_end);
                if (!(vma->vm_flags & VM_NONLINEAR))
                        root = &mapping->i_mmap;
                spin_lock(&mapping->i_mmap_lock);
-               if (importer &&
-                   vma->vm_truncate_count != next->vm_truncate_count) {
-                       /*
-                        * unmap_mapping_range might be in progress:
-                        * ensure that the expanding vma is rescanned.
-                        */
-                       importer->vm_truncate_count = 0;
-               }
                if (insert) {
-                       insert->vm_truncate_count = vma->vm_truncate_count;
                        /*
                         * Put into prio_tree now, so instantiated pages
                         * are visible to arm/parisc __flush_dcache_page
index a7c1f9f..909e1e1 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -94,7 +94,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
                 */
                mapping = vma->vm_file->f_mapping;
                spin_lock(&mapping->i_mmap_lock);
-               new_vma->vm_truncate_count = 0;
        }
 
        /*