mm->map_count++;
}
-static inline void
-__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
- struct vm_area_struct *prev)
+static __always_inline void __vma_unlink_common(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct vm_area_struct *prev,
+ bool has_prev)
{
struct vm_area_struct *next;
vma_rb_erase(vma, &mm->mm_rb);
- prev->vm_next = next = vma->vm_next;
+ next = vma->vm_next;
+ if (has_prev)
+ prev->vm_next = next;
+ else {
+ prev = vma->vm_prev;
+ if (prev)
+ prev->vm_next = next;
+ else
+ mm->mmap = next;
+ }
if (next)
next->vm_prev = prev;
vmacache_invalidate(mm);
}
+static inline void __vma_unlink_prev(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ struct vm_area_struct *prev)
+{
+ __vma_unlink_common(mm, vma, prev, true);
+}
+
+static inline void __vma_unlink(struct mm_struct *mm,
+ struct vm_area_struct *vma)
+{
+ __vma_unlink_common(mm, vma, NULL, false);
+}
+
/*
* We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that
* is already present in an i_mmap tree without adjusting the tree.
* are necessary. The "insert" vma (if any) is to be inserted
* before we drop the necessary locks.
*/
-int vma_adjust(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert)
+int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
+ unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert,
+ struct vm_area_struct *expand)
{
struct mm_struct *mm = vma->vm_mm;
- struct vm_area_struct *next = vma->vm_next;
+ struct vm_area_struct *next = vma->vm_next, *orig_vma = vma;
struct address_space *mapping = NULL;
struct rb_root *root = NULL;
struct anon_vma *anon_vma = NULL;
/*
* vma expands, overlapping all the next, and
* perhaps the one after too (mprotect case 6).
+ * The only two other cases that gets here are
+ * case 1, case 7 and case 8.
*/
- remove_next = 1 + (end > next->vm_end);
- end = next->vm_end;
+ if (next == expand) {
+ /*
+ * The only case where we don't expand "vma"
+ * and we expand "next" instead is case 8.
+ */
+ VM_WARN_ON(end != next->vm_end);
+ /*
+ * remove_next == 3 means we're
+ * removing "vma" and that to do so we
+ * swapped "vma" and "next".
+ */
+ remove_next = 3;
+ VM_WARN_ON(file != next->vm_file);
+ swap(vma, next);
+ } else {
+ VM_WARN_ON(expand != vma);
+ /*
+ * case 1, 6, 7, remove_next == 2 is case 6,
+ * remove_next == 1 is case 1 or 7.
+ */
+ remove_next = 1 + (end > next->vm_end);
+ VM_WARN_ON(remove_next == 2 &&
+ end != next->vm_next->vm_end);
+ VM_WARN_ON(remove_next == 1 &&
+ end != next->vm_end);
+ /* trim end to next, for case 6 first pass */
+ end = next->vm_end;
+ }
+
exporter = next;
importer = vma;
adjust_next = (end - next->vm_start) >> PAGE_SHIFT;
exporter = next;
importer = vma;
+ VM_WARN_ON(expand != importer);
} else if (end < vma->vm_end) {
/*
* vma shrinks, and !insert tells it's not
adjust_next = -((vma->vm_end - end) >> PAGE_SHIFT);
exporter = vma;
importer = next;
+ VM_WARN_ON(expand != importer);
}
/*
}
}
again:
- vma_adjust_trans_huge(vma, start, end, adjust_next);
+ vma_adjust_trans_huge(orig_vma, start, end, adjust_next);
if (file) {
mapping = file->f_mapping;
if (!anon_vma && adjust_next)
anon_vma = next->anon_vma;
if (anon_vma) {
- VM_BUG_ON_VMA(adjust_next && next->anon_vma &&
- anon_vma != next->anon_vma, next);
+ VM_WARN_ON(adjust_next && next->anon_vma &&
+ anon_vma != next->anon_vma);
anon_vma_lock_write(anon_vma);
anon_vma_interval_tree_pre_update_vma(vma);
if (adjust_next)
* vma_merge has merged next into vma, and needs
* us to remove next before dropping the locks.
*/
- __vma_unlink(mm, next, vma);
+ if (remove_next != 3)
+ __vma_unlink_prev(mm, next, vma);
+ else
+ /* vma is not before next if they've been swapped */
+ __vma_unlink(mm, next);
if (file)
__remove_shared_vm_struct(next, file, mapping);
} else if (insert) {
* we must remove another next too. It would clutter
* up the code too much to do both in one go.
*/
- next = vma->vm_next;
+ if (remove_next != 3) {
+ /*
+ * If "next" was removed and vma->vm_end was
+ * expanded (up) over it, in turn
+ * "next->vm_prev->vm_end" changed and the
+ * "vma->vm_next" gap must be updated.
+ */
+ next = vma->vm_next;
+ } else {
+ /*
+ * For the scope of the comment "next" and
+ * "vma" considered pre-swap(): if "vma" was
+ * removed, next->vm_start was expanded (down)
+ * over it and the "next" gap must be updated.
+ * Because of the swap() the post-swap() "vma"
+ * actually points to pre-swap() "next"
+ * (post-swap() "next" as opposed is now a
+ * dangling pointer).
+ */
+ next = vma;
+ }
if (remove_next == 2) {
remove_next = 1;
end = next->vm_end;
* cannot merge might become might become might become
* PPNNNNNNNNNN PPPPPPPPPPNN PPPPPPPPPPPP 6 or
* mmap, brk or case 4 below case 5 below PPPPPPPPXXXX 7 or
- * mremap move: PPPPNNNNNNNN 8
+ * mremap move: PPPPXXXXXXXX 8
* AAAA
* PPPP NNNN PPPPPPPPPPPP PPPPPPPPNNNN PPPPNNNNNNNN
* might become case 1 below case 2 below case 3 below
*
- * Odd one out? Case 8, because it extends NNNN but needs flags of XXXX:
- * mprotect_fixup updates vm_flags & vm_page_prot on successful return.
+ * It is important for case 8 that the the vma NNNN overlapping the
+ * region AAAA is never going to extended over XXXX. Instead XXXX must
+ * be extended in region AAAA and NNNN must be removed. This way in
+ * all cases where vma_merge succeeds, the moment vma_adjust drops the
+ * rmap_locks, the properties of the merged vma will be already
+ * correct for the whole merged range. Some of those properties like
+ * vm_page_prot/vm_flags may be accessed by rmap_walks and they must
+ * be correct for the whole merged range immediately after the
+ * rmap_locks are released. Otherwise if XXXX would be removed and
+ * NNNN would be extended over the XXXX range, remove_migration_ptes
+ * or other rmap walkers (if working on addresses beyond the "end"
+ * parameter) may establish ptes with the wrong permissions of NNNN
+ * instead of the right permissions of XXXX.
*/
struct vm_area_struct *vma_merge(struct mm_struct *mm,
struct vm_area_struct *prev, unsigned long addr,
else
next = mm->mmap;
area = next;
- if (next && next->vm_end == end) /* cases 6, 7, 8 */
+ if (area && area->vm_end == end) /* cases 6, 7, 8 */
next = next->vm_next;
+ /* verify some invariant that must be enforced by the caller */
+ VM_WARN_ON(prev && addr <= prev->vm_start);
+ VM_WARN_ON(area && end > area->vm_end);
+ VM_WARN_ON(addr >= end);
+
/*
* Can it merge with the predecessor?
*/
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
- err = vma_adjust(prev, prev->vm_start,
- next->vm_end, prev->vm_pgoff, NULL);
+ err = __vma_adjust(prev, prev->vm_start,
+ next->vm_end, prev->vm_pgoff, NULL,
+ prev);
} else /* cases 2, 5, 7 */
- err = vma_adjust(prev, prev->vm_start,
- end, prev->vm_pgoff, NULL);
+ err = __vma_adjust(prev, prev->vm_start,
+ end, prev->vm_pgoff, NULL, prev);
if (err)
return NULL;
khugepaged_enter_vma_merge(prev, vm_flags);
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx)) {
if (prev && addr < prev->vm_end) /* case 4 */
- err = vma_adjust(prev, prev->vm_start,
- addr, prev->vm_pgoff, NULL);
- else /* cases 3, 8 */
- err = vma_adjust(area, addr, next->vm_end,
- next->vm_pgoff - pglen, NULL);
+ err = __vma_adjust(prev, prev->vm_start,
+ addr, prev->vm_pgoff, NULL, next);
+ else { /* cases 3, 8 */
+ err = __vma_adjust(area, addr, next->vm_end,
+ next->vm_pgoff - pglen, NULL, next);
+ /*
+ * In case 3 area is already equal to next and
+ * this is a noop, but in case 8 "area" has
+ * been removed and next was expanded over it.
+ */
+ area = next;
+ }
if (err)
return NULL;
khugepaged_enter_vma_merge(area, vm_flags);