diff v7→v4 (patch 8) | series

Inter-revision diff: patch 8

Comparing v7 (message) to v4 (message)
--- v7
+++ v4
@@ -1,394 +1,52 @@
-The VMA sequence count has been introduced to allow fast detection of
-VMA modification when running a page fault handler without holding
-the mmap_sem.
+The speculative page fault handler must be protected against anon_vma
+changes. This is because page_add_new_anon_rmap() is called during the
+speculative path.
 
-This patch provides protection against the VMA modification done in :
-	- madvise()
-	- mpol_rebind_policy()
-	- vma_replace_policy()
-	- change_prot_numa()
-	- mlock(), munlock()
-	- mprotect()
-	- mmap_region()
-	- collapse_huge_page()
-	- userfaultd registering services
+In addition, don't try speculative page fault if the VMA don't have an
+anon_vma structure allocated because its allocation should be
+protected by the mmap_sem.
 
-In addition, VMA fields which will be read during the speculative fault
-path needs to be written using WRITE_ONCE to prevent write to be split
-and intermediate values to be pushed to other CPUs.
+In __vma_adjust() when importer->anon_vma is set, there is no need to
+protect against speculative page faults since speculative page fault
+is aborted if the vma->anon_vma is not set.
+
+When calling page_add_new_anon_rmap() vma->anon_vma is necessarily
+valid since we checked for it when locking the pte and the anon_vma is
+removed once the pte is unlocked. So even if the speculative page
+fault handler is running concurrently with do_unmap(), as the pte is
+locked in unmap_region() - through unmap_vmas() - and the anon_vma
+unlinked later, because we check for the vma sequence counter which is
+updated in unmap_page_range() before locking the pte, and then in
+free_pgtables() so when locking the pte the change will be detected.
 
 Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
 ---
- fs/proc/task_mmu.c |  5 ++++-
- fs/userfaultfd.c   | 17 +++++++++++++----
- mm/khugepaged.c    |  3 +++
- mm/madvise.c       |  6 +++++-
- mm/mempolicy.c     | 51 ++++++++++++++++++++++++++++++++++-----------------
- mm/mlock.c         | 13 ++++++++-----
- mm/mmap.c          | 17 ++++++++++-------
- mm/mprotect.c      |  4 +++-
- mm/swap_state.c    |  8 ++++++--
- 9 files changed, 86 insertions(+), 38 deletions(-)
+ mm/memory.c | 4 ++++
+ 1 file changed, 4 insertions(+)
 
-diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
-index b66fc8de7d34..adb66e1989ee 100644
---- a/fs/proc/task_mmu.c
-+++ b/fs/proc/task_mmu.c
-@@ -1158,8 +1158,11 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
- 					goto out_mm;
- 				}
- 				for (vma = mm->mmap; vma; vma = vma->vm_next) {
--					vma->vm_flags &= ~VM_SOFTDIRTY;
-+					vm_write_begin(vma);
-+					WRITE_ONCE(vma->vm_flags,
-+						   vma->vm_flags & ~VM_SOFTDIRTY);
- 					vma_set_page_prot(vma);
-+					vm_write_end(vma);
- 				}
- 				downgrade_write(&mm->mmap_sem);
- 				break;
-diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
-index 87a13a7c8270..1da1ba63c7dd 100644
---- a/fs/userfaultfd.c
-+++ b/fs/userfaultfd.c
-@@ -659,8 +659,11 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs)
+diff --git a/mm/memory.c b/mm/memory.c
+index 65cd35b5d0c0..d972ef4e3151 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -618,7 +618,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ 		 * Hide vma from rmap and truncate_pagecache before freeing
+ 		 * pgtables
+ 		 */
++		vm_write_begin(vma);
+ 		unlink_anon_vmas(vma);
++		vm_write_end(vma);
+ 		unlink_file_vma(vma);
  
- 	octx = vma->vm_userfaultfd_ctx.ctx;
- 	if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
-+		vm_write_begin(vma);
- 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
--		vma->vm_flags &= ~(VM_UFFD_WP | VM_UFFD_MISSING);
-+		WRITE_ONCE(vma->vm_flags,
-+			   vma->vm_flags & ~(VM_UFFD_WP | VM_UFFD_MISSING));
-+		vm_write_end(vma);
- 		return 0;
- 	}
- 
-@@ -885,8 +888,10 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
- 			vma = prev;
- 		else
- 			prev = vma;
--		vma->vm_flags = new_flags;
-+		vm_write_begin(vma);
-+		WRITE_ONCE(vma->vm_flags, new_flags);
- 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-+		vm_write_end(vma);
- 	}
- 	up_write(&mm->mmap_sem);
- 	mmput(mm);
-@@ -1434,8 +1439,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
- 		 * the next vma was merged into the current one and
- 		 * the current one has not been updated yet.
- 		 */
--		vma->vm_flags = new_flags;
-+		vm_write_begin(vma);
-+		WRITE_ONCE(vma->vm_flags, new_flags);
- 		vma->vm_userfaultfd_ctx.ctx = ctx;
-+		vm_write_end(vma);
- 
- 	skip:
- 		prev = vma;
-@@ -1592,8 +1599,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
- 		 * the next vma was merged into the current one and
- 		 * the current one has not been updated yet.
- 		 */
--		vma->vm_flags = new_flags;
-+		vm_write_begin(vma);
-+		WRITE_ONCE(vma->vm_flags, new_flags);
- 		vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
-+		vm_write_end(vma);
- 
- 	skip:
- 		prev = vma;
-diff --git a/mm/khugepaged.c b/mm/khugepaged.c
-index b7e2268dfc9a..32314e9e48dd 100644
---- a/mm/khugepaged.c
-+++ b/mm/khugepaged.c
-@@ -1006,6 +1006,7 @@ static void collapse_huge_page(struct mm_struct *mm,
- 	if (mm_find_pmd(mm, address) != pmd)
- 		goto out;
- 
-+	vm_write_begin(vma);
- 	anon_vma_lock_write(vma->anon_vma);
- 
- 	pte = pte_offset_map(pmd, address);
-@@ -1041,6 +1042,7 @@ static void collapse_huge_page(struct mm_struct *mm,
- 		pmd_populate(mm, pmd, pmd_pgtable(_pmd));
- 		spin_unlock(pmd_ptl);
- 		anon_vma_unlock_write(vma->anon_vma);
-+		vm_write_end(vma);
- 		result = SCAN_FAIL;
- 		goto out;
- 	}
-@@ -1075,6 +1077,7 @@ static void collapse_huge_page(struct mm_struct *mm,
- 	set_pmd_at(mm, address, pmd, _pmd);
- 	update_mmu_cache_pmd(vma, address, pmd);
- 	spin_unlock(pmd_ptl);
-+	vm_write_end(vma);
- 
- 	*hpage = NULL;
- 
-diff --git a/mm/madvise.c b/mm/madvise.c
-index 4d3c922ea1a1..e328f7ab5942 100644
---- a/mm/madvise.c
-+++ b/mm/madvise.c
-@@ -184,7 +184,9 @@ static long madvise_behavior(struct vm_area_struct *vma,
- 	/*
- 	 * vm_flags is protected by the mmap_sem held in write mode.
- 	 */
--	vma->vm_flags = new_flags;
-+	vm_write_begin(vma);
-+	WRITE_ONCE(vma->vm_flags, new_flags);
-+	vm_write_end(vma);
- out:
- 	return error;
- }
-@@ -450,9 +452,11 @@ static void madvise_free_page_range(struct mmu_gather *tlb,
- 		.private = tlb,
- 	};
- 
-+	vm_write_begin(vma);
- 	tlb_start_vma(tlb, vma);
- 	walk_page_range(addr, end, &free_walk);
- 	tlb_end_vma(tlb, vma);
-+	vm_write_end(vma);
- }
- 
- static int madvise_free_single_vma(struct vm_area_struct *vma,
-diff --git a/mm/mempolicy.c b/mm/mempolicy.c
-index a8b7d59002e8..49d731c39e4f 100644
---- a/mm/mempolicy.c
-+++ b/mm/mempolicy.c
-@@ -380,8 +380,11 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
- 	struct vm_area_struct *vma;
- 
- 	down_write(&mm->mmap_sem);
--	for (vma = mm->mmap; vma; vma = vma->vm_next)
-+	for (vma = mm->mmap; vma; vma = vma->vm_next) {
-+		vm_write_begin(vma);
- 		mpol_rebind_policy(vma->vm_policy, new);
-+		vm_write_end(vma);
-+	}
- 	up_write(&mm->mmap_sem);
- }
- 
-@@ -554,9 +557,11 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
- {
- 	int nr_updated;
- 
-+	vm_write_begin(vma);
- 	nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
- 	if (nr_updated)
- 		count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
-+	vm_write_end(vma);
- 
- 	return nr_updated;
- }
-@@ -657,6 +662,7 @@ static int vma_replace_policy(struct vm_area_struct *vma,
- 	if (IS_ERR(new))
- 		return PTR_ERR(new);
- 
-+	vm_write_begin(vma);
- 	if (vma->vm_ops && vma->vm_ops->set_policy) {
- 		err = vma->vm_ops->set_policy(vma, new);
- 		if (err)
-@@ -664,11 +670,17 @@ static int vma_replace_policy(struct vm_area_struct *vma,
- 	}
- 
- 	old = vma->vm_policy;
--	vma->vm_policy = new; /* protected by mmap_sem */
-+	/*
-+	 * The speculative page fault handler access this field without
-+	 * hodling the mmap_sem.
-+	 */
-+	WRITE_ONCE(vma->vm_policy,  new);
-+	vm_write_end(vma);
- 	mpol_put(old);
- 
- 	return 0;
-  err_out:
-+	vm_write_end(vma);
- 	mpol_put(new);
- 	return err;
- }
-@@ -1552,23 +1564,28 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, compat_ulong_t, len,
- struct mempolicy *__get_vma_policy(struct vm_area_struct *vma,
- 						unsigned long addr)
- {
--	struct mempolicy *pol = NULL;
-+	struct mempolicy *pol;
- 
--	if (vma) {
--		if (vma->vm_ops && vma->vm_ops->get_policy) {
--			pol = vma->vm_ops->get_policy(vma, addr);
--		} else if (vma->vm_policy) {
--			pol = vma->vm_policy;
-+	if (!vma)
-+		return NULL;
- 
--			/*
--			 * shmem_alloc_page() passes MPOL_F_SHARED policy with
--			 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
--			 * count on these policies which will be dropped by
--			 * mpol_cond_put() later
--			 */
--			if (mpol_needs_cond_ref(pol))
--				mpol_get(pol);
--		}
-+	if (vma->vm_ops && vma->vm_ops->get_policy)
-+		return vma->vm_ops->get_policy(vma, addr);
-+
-+	/*
-+	 * This could be called without holding the mmap_sem in the
-+	 * speculative page fault handler's path.
-+	 */
-+	pol = READ_ONCE(vma->vm_policy);
-+	if (pol) {
-+		/*
-+		 * shmem_alloc_page() passes MPOL_F_SHARED policy with
-+		 * a pseudo vma whose vma->vm_ops=NULL. Take a reference
-+		 * count on these policies which will be dropped by
-+		 * mpol_cond_put() later
-+		 */
-+		if (mpol_needs_cond_ref(pol))
-+			mpol_get(pol);
- 	}
- 
- 	return pol;
-diff --git a/mm/mlock.c b/mm/mlock.c
-index 74e5a6547c3d..c40285c94ced 100644
---- a/mm/mlock.c
-+++ b/mm/mlock.c
-@@ -445,7 +445,9 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
- void munlock_vma_pages_range(struct vm_area_struct *vma,
- 			     unsigned long start, unsigned long end)
- {
--	vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
-+	vm_write_begin(vma);
-+	WRITE_ONCE(vma->vm_flags, vma->vm_flags & VM_LOCKED_CLEAR_MASK);
-+	vm_write_end(vma);
- 
- 	while (start < end) {
- 		struct page *page;
-@@ -568,10 +570,11 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
- 	 * It's okay if try_to_unmap_one unmaps a page just after we
- 	 * set VM_LOCKED, populate_vma_page_range will bring it back.
- 	 */
--
--	if (lock)
--		vma->vm_flags = newflags;
--	else
-+	if (lock) {
-+		vm_write_begin(vma);
-+		WRITE_ONCE(vma->vm_flags, newflags);
-+		vm_write_end(vma);
-+	} else
- 		munlock_vma_pages_range(vma, start, end);
- 
- out:
-diff --git a/mm/mmap.c b/mm/mmap.c
-index 671940b72696..c4d944e1be23 100644
---- a/mm/mmap.c
-+++ b/mm/mmap.c
-@@ -847,17 +847,18 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- 	}
- 
- 	if (start != vma->vm_start) {
--		vma->vm_start = start;
-+		WRITE_ONCE(vma->vm_start, start);
- 		start_changed = true;
- 	}
- 	if (end != vma->vm_end) {
--		vma->vm_end = end;
-+		WRITE_ONCE(vma->vm_end, end);
- 		end_changed = true;
- 	}
--	vma->vm_pgoff = pgoff;
-+	WRITE_ONCE(vma->vm_pgoff, pgoff);
- 	if (adjust_next) {
--		next->vm_start += adjust_next << PAGE_SHIFT;
--		next->vm_pgoff += adjust_next;
-+		WRITE_ONCE(next->vm_start,
-+			   next->vm_start + (adjust_next << PAGE_SHIFT));
-+		WRITE_ONCE(next->vm_pgoff, next->vm_pgoff + adjust_next);
- 	}
- 
- 	if (root) {
-@@ -1781,6 +1782,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
- out:
- 	perf_event_mmap(vma);
- 
-+	vm_write_begin(vma);
- 	vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT);
- 	if (vm_flags & VM_LOCKED) {
- 		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
-@@ -1803,6 +1805,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
- 	vma->vm_flags |= VM_SOFTDIRTY;
- 
- 	vma_set_page_prot(vma);
-+	vm_write_end(vma);
- 
- 	return addr;
- 
-@@ -2431,8 +2434,8 @@ int expand_downwards(struct vm_area_struct *vma,
- 					mm->locked_vm += grow;
- 				vm_stat_account(mm, vma->vm_flags, grow);
- 				anon_vma_interval_tree_pre_update_vma(vma);
--				vma->vm_start = address;
--				vma->vm_pgoff -= grow;
-+				WRITE_ONCE(vma->vm_start, address);
-+				WRITE_ONCE(vma->vm_pgoff, vma->vm_pgoff - grow);
- 				anon_vma_interval_tree_post_update_vma(vma);
- 				vma_gap_update(vma);
- 				spin_unlock(&mm->page_table_lock);
-diff --git a/mm/mprotect.c b/mm/mprotect.c
-index e3309fcf586b..9b7a71c30287 100644
---- a/mm/mprotect.c
-+++ b/mm/mprotect.c
-@@ -366,7 +366,8 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
- 	 * vm_flags and vm_page_prot are protected by the mmap_sem
- 	 * held in write mode.
- 	 */
--	vma->vm_flags = newflags;
-+	vm_write_begin(vma);
-+	WRITE_ONCE(vma->vm_flags, newflags);
- 	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
- 	vma_set_page_prot(vma);
- 
-@@ -381,6 +382,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
- 			(newflags & VM_WRITE)) {
- 		populate_vma_page_range(vma, start, end, NULL);
- 	}
-+	vm_write_end(vma);
- 
- 	vm_stat_account(mm, oldflags, -nrpages);
- 	vm_stat_account(mm, newflags, nrpages);
-diff --git a/mm/swap_state.c b/mm/swap_state.c
-index 39ae7cfad90f..eeef59c2cd76 100644
---- a/mm/swap_state.c
-+++ b/mm/swap_state.c
-@@ -550,6 +550,10 @@ static unsigned long swapin_nr_pages(unsigned long offset)
-  * the readahead.
-  *
-  * Caller must hold down_read on the vma->vm_mm if vma is not NULL.
-+ * This is needed to ensure the VMA will not be freed in our back. In the case
-+ * of the speculative page fault handler, this cannot happen, even if we don't
-+ * hold the mmap_sem. Callees are assumed to take care of reading VMA's fields
-+ * using READ_ONCE() to read consistent values.
-  */
- struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
- 			struct vm_area_struct *vma, unsigned long addr)
-@@ -643,9 +647,9 @@ static inline void swap_ra_clamp_pfn(struct vm_area_struct *vma,
- 				     unsigned long *start,
- 				     unsigned long *end)
- {
--	*start = max3(lpfn, PFN_DOWN(vma->vm_start),
-+	*start = max3(lpfn, PFN_DOWN(READ_ONCE(vma->vm_start)),
- 		      PFN_DOWN(faddr & PMD_MASK));
--	*end = min3(rpfn, PFN_DOWN(vma->vm_end),
-+	*end = min3(rpfn, PFN_DOWN(READ_ONCE(vma->vm_end)),
- 		    PFN_DOWN((faddr & PMD_MASK) + PMD_SIZE));
- }
- 
+ 		if (is_vm_hugetlb_page(vma)) {
+@@ -632,7 +634,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ 			       && !is_vm_hugetlb_page(next)) {
+ 				vma = next;
+ 				next = vma->vm_next;
++				vm_write_begin(vma);
+ 				unlink_anon_vmas(vma);
++				vm_write_end(vma);
+ 				unlink_file_vma(vma);
+ 			}
+ 			free_pgd_range(tlb, addr, vma->vm_end,
 -- 
 2.7.4
`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help