Inter-revision diff: patch 8

Comparing v10 (message) to v2 (message)

--- v10
+++ v2
@@ -1,193 +1,52 @@
-From: Peter Zijlstra <peterz@infradead.org>
+The speculative page fault handler must be protected against anon_vma
+changes. This is because page_add_new_anon_rmap() is called during the
+speculative path.
 
-Wrap the VMA modifications (vma_adjust/unmap_page_range) with sequence
-counts such that we can easily test if a VMA is changed.
+In addition, don't try speculative page fault if the VMA don't have an
+anon_vma structure allocated because its allocation should be
+protected by the mmap_sem.
 
-The unmap_page_range() one allows us to make assumptions about
-page-tables; when we find the seqcount hasn't changed we can assume
-page-tables are still valid.
+In __vma_adjust() when importer->anon_vma is set, there is no need to
+protect against speculative page faults since speculative page fault
+is aborted if the vma->anon_vma is not set.
 
-The flip side is that we cannot distinguish between a vma_adjust() and
-the unmap_page_range() -- where with the former we could have
-re-checked the vma bounds against the address.
+When calling page_add_new_anon_rmap() vma->anon_vma is necessarily
+valid since we checked for it when locking the pte and the anon_vma is
+removed once the pte is unlocked. So even if the speculative page
+fault handler is running concurrently with do_unmap(), as the pte is
+locked in unmap_region() - through unmap_vmas() - and the anon_vma
+unlinked later, because we check for the vma sequence counter which is
+updated in unmap_page_range() before locking the pte, and then in
+free_pgtables() so when locking the pte the change will be detected.
 
-Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
-
-[Port to 4.12 kernel]
-[Build depends on CONFIG_SPECULATIVE_PAGE_FAULT]
-[Introduce vm_write_* inline function depending on
- CONFIG_SPECULATIVE_PAGE_FAULT]
-[Fix lock dependency between mapping->i_mmap_rwsem and vma->vm_sequence by
- using vm_raw_write* functions]
-[Fix a lock dependency warning in mmap_region() when entering the error
- path]
-[move sequence initialisation INIT_VMA()]
 Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
 ---
- include/linux/mm.h       | 44 ++++++++++++++++++++++++++++++++++++++++++++
- include/linux/mm_types.h |  3 +++
- mm/memory.c              |  2 ++
- mm/mmap.c                | 31 +++++++++++++++++++++++++++++++
- 4 files changed, 80 insertions(+)
+ mm/memory.c | 4 ++++
+ 1 file changed, 4 insertions(+)
 
-diff --git a/include/linux/mm.h b/include/linux/mm.h
-index efc1248b82bd..988daf7030c9 100644
---- a/include/linux/mm.h
-+++ b/include/linux/mm.h
-@@ -1264,6 +1264,9 @@ struct zap_details {
- static inline void INIT_VMA(struct vm_area_struct *vma)
- {
- 	INIT_LIST_HEAD(&vma->anon_vma_chain);
-+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-+	seqcount_init(&vma->vm_sequence);
-+#endif
- }
- 
- struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
-@@ -1386,6 +1389,47 @@ static inline void unmap_shared_mapping_range(struct address_space *mapping,
- 	unmap_mapping_range(mapping, holebegin, holelen, 0);
- }
- 
-+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-+static inline void vm_write_begin(struct vm_area_struct *vma)
-+{
-+	write_seqcount_begin(&vma->vm_sequence);
-+}
-+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
-+					 int subclass)
-+{
-+	write_seqcount_begin_nested(&vma->vm_sequence, subclass);
-+}
-+static inline void vm_write_end(struct vm_area_struct *vma)
-+{
-+	write_seqcount_end(&vma->vm_sequence);
-+}
-+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
-+{
-+	raw_write_seqcount_begin(&vma->vm_sequence);
-+}
-+static inline void vm_raw_write_end(struct vm_area_struct *vma)
-+{
-+	raw_write_seqcount_end(&vma->vm_sequence);
-+}
-+#else
-+static inline void vm_write_begin(struct vm_area_struct *vma)
-+{
-+}
-+static inline void vm_write_begin_nested(struct vm_area_struct *vma,
-+					 int subclass)
-+{
-+}
-+static inline void vm_write_end(struct vm_area_struct *vma)
-+{
-+}
-+static inline void vm_raw_write_begin(struct vm_area_struct *vma)
-+{
-+}
-+static inline void vm_raw_write_end(struct vm_area_struct *vma)
-+{
-+}
-+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
-+
- extern int access_process_vm(struct task_struct *tsk, unsigned long addr,
- 		void *buf, int len, unsigned int gup_flags);
- extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
-diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
-index 21612347d311..db5e9d630e7a 100644
---- a/include/linux/mm_types.h
-+++ b/include/linux/mm_types.h
-@@ -335,6 +335,9 @@ struct vm_area_struct {
- 	struct mempolicy *vm_policy;	/* NUMA policy for the VMA */
- #endif
- 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
-+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-+	seqcount_t vm_sequence;
-+#endif
- } __randomize_layout;
- 
- struct core_thread {
 diff --git a/mm/memory.c b/mm/memory.c
-index f86efcb8e268..f7fed053df80 100644
+index da3bd07bb052..68e4fdcce692 100644
 --- a/mm/memory.c
 +++ b/mm/memory.c
-@@ -1503,6 +1503,7 @@ void unmap_page_range(struct mmu_gather *tlb,
- 	unsigned long next;
+@@ -615,7 +615,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ 		 * Hide vma from rmap and truncate_pagecache before freeing
+ 		 * pgtables
+ 		 */
++		write_seqcount_begin(&vma->vm_sequence);
+ 		unlink_anon_vmas(vma);
++		write_seqcount_end(&vma->vm_sequence);
+ 		unlink_file_vma(vma);
  
- 	BUG_ON(addr >= end);
-+	vm_write_begin(vma);
- 	tlb_start_vma(tlb, vma);
- 	pgd = pgd_offset(vma->vm_mm, addr);
- 	do {
-@@ -1512,6 +1513,7 @@ void unmap_page_range(struct mmu_gather *tlb,
- 		next = zap_p4d_range(tlb, vma, pgd, addr, next, details);
- 	} while (pgd++, addr = next, addr != end);
- 	tlb_end_vma(tlb, vma);
-+	vm_write_end(vma);
- }
- 
- 
-diff --git a/mm/mmap.c b/mm/mmap.c
-index 8bd9ae1dfacc..813e49589ea1 100644
---- a/mm/mmap.c
-+++ b/mm/mmap.c
-@@ -692,6 +692,30 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- 	long adjust_next = 0;
- 	int remove_next = 0;
- 
-+	/*
-+	 * Why using vm_raw_write*() functions here to avoid lockdep's warning ?
-+	 *
-+	 * Locked is complaining about a theoretical lock dependency, involving
-+	 * 3 locks:
-+	 *   mapping->i_mmap_rwsem --> vma->vm_sequence --> fs_reclaim
-+	 *
-+	 * Here are the major path leading to this dependency :
-+	 *  1. __vma_adjust() mmap_sem  -> vm_sequence -> i_mmap_rwsem
-+	 *  2. move_vmap() mmap_sem -> vm_sequence -> fs_reclaim
-+	 *  3. __alloc_pages_nodemask() fs_reclaim -> i_mmap_rwsem
-+	 *  4. unmap_mapping_range() i_mmap_rwsem -> vm_sequence
-+	 *
-+	 * So there is no way to solve this easily, especially because in
-+	 * unmap_mapping_range() the i_mmap_rwsem is grab while the impacted
-+	 * VMAs are not yet known.
-+	 * However, the way the vm_seq is used is guarantying that we will
-+	 * never block on it since we just check for its value and never wait
-+	 * for it to move, see vma_has_changed() and handle_speculative_fault().
-+	 */
-+	vm_raw_write_begin(vma);
-+	if (next)
-+		vm_raw_write_begin(next);
-+
- 	if (next && !insert) {
- 		struct vm_area_struct *exporter = NULL, *importer = NULL;
- 
-@@ -902,6 +926,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- 			anon_vma_merge(vma, next);
- 		mm->map_count--;
- 		mpol_put(vma_policy(next));
-+		vm_raw_write_end(next);
- 		kmem_cache_free(vm_area_cachep, next);
- 		/*
- 		 * In mprotect's case 6 (see comments on vma_merge),
-@@ -916,6 +941,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- 			 * "vma->vm_next" gap must be updated.
- 			 */
- 			next = vma->vm_next;
-+			if (next)
-+				vm_raw_write_begin(next);
- 		} else {
- 			/*
- 			 * For the scope of the comment "next" and
-@@ -962,6 +989,10 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- 	if (insert && file)
- 		uprobe_mmap(insert);
- 
-+	if (next && next != vma)
-+		vm_raw_write_end(next);
-+	vm_raw_write_end(vma);
-+
- 	validate_mm(mm);
- 
- 	return 0;
+ 		if (is_vm_hugetlb_page(vma)) {
+@@ -629,7 +631,9 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
+ 			       && !is_vm_hugetlb_page(next)) {
+ 				vma = next;
+ 				next = vma->vm_next;
++				write_seqcount_begin(&vma->vm_sequence);
+ 				unlink_anon_vmas(vma);
++				write_seqcount_end(&vma->vm_sequence);
+ 				unlink_file_vma(vma);
+ 			}
+ 			free_pgd_range(tlb, addr, vma->vm_end,
 -- 
 2.7.4
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help