Inter-revision diff: patch 13

Comparing v6 (message) to v2 (message)

--- v6
+++ v2
@@ -1,69 +1,131 @@
-The current maybe_mkwrite() is getting passed the pointer to the vma
-structure to fetch the vm_flags field.
+When dealing with the speculative page fault handler, we may race with a
+VMA being split or merged. In this case the vma->vm_start and vma->vm_end
+fields may not match the address at which the page fault is occurring.
 
-When dealing with the speculative page fault handler, it will be better to
-rely on the cached vm_flags value stored in the vm_fault structure.
+This can only happen when the VMA is split, but in that case the
+anon_vma pointer of the new VMA will be the same as the original one,
+because in __split_vma the new->anon_vma is set to src->anon_vma when
+*new = *vma.
 
-This patch introduces a __maybe_mkwrite() service which can be called by
-passing the value of the vm_flags field.
+So even if the VMA boundaries are not correct, the anon_vma pointer is
+still valid.
 
-There are no functional changes expected for the other callers of
-maybe_mkwrite().
+If the VMA has been merged, then the VMA in which it has been merged
+must have the same anon_vma pointer otherwise the merge can't be done.
+
+So in all cases we know that the anon_vma is valid, since we have
+checked before starting the speculative page fault that the anon_vma
+pointer is valid for this VMA, and since there is an anon_vma this
+means that at one time a page has been backed and that before the VMA
+is cleaned, the page table lock would have to be grabbed to clean the
+PTE, and the anon_vma field is checked once the PTE is locked.
+
+This patch introduces a new __page_add_new_anon_rmap() service which
+doesn't check for the VMA boundaries, and creates a new inline one
+which does the check.
+
+When called from a page fault handler, if this is not a speculative one,
+there is a guarantee that vm_start and vm_end match the faulting address,
+so this check is useless. In the context of the speculative page fault
+handler, this check may be wrong but anon_vma is still valid as explained
+above.
 
 Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
 ---
- include/linux/mm.h | 9 +++++++--
- mm/memory.c        | 6 +++---
- 2 files changed, 10 insertions(+), 5 deletions(-)
+ include/linux/rmap.h | 12 ++++++++++--
+ mm/memory.c          |  8 ++++----
+ mm/rmap.c            |  5 ++---
+ 3 files changed, 16 insertions(+), 9 deletions(-)
 
-diff --git a/include/linux/mm.h b/include/linux/mm.h
-index f14a73a8d420..d77a11067b94 100644
---- a/include/linux/mm.h
-+++ b/include/linux/mm.h
-@@ -685,13 +685,18 @@ void free_compound_page(struct page *page);
-  * pte_mkwrite.  But get_user_pages can cause write faults for mappings
-  * that do not have writing enabled, when used by access_process_vm.
-  */
--static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-+static inline pte_t __maybe_mkwrite(pte_t pte, unsigned long vma_flags)
- {
--	if (likely(vma->vm_flags & VM_WRITE))
-+	if (likely(vma_flags & VM_WRITE))
- 		pte = pte_mkwrite(pte);
- 	return pte;
+diff --git a/include/linux/rmap.h b/include/linux/rmap.h
+index 43ef2c30cb0f..f5cd4dbc78b0 100644
+--- a/include/linux/rmap.h
++++ b/include/linux/rmap.h
+@@ -170,8 +170,16 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *,
+ 		unsigned long, bool);
+ void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
+ 			   unsigned long, int);
+-void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
+-		unsigned long, bool);
++void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
++			      unsigned long, bool);
++static inline void page_add_new_anon_rmap(struct page *page,
++					  struct vm_area_struct *vma,
++					  unsigned long address, bool compound)
++{
++	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
++	__page_add_new_anon_rmap(page, vma, address, compound);
++}
++
+ void page_add_file_rmap(struct page *, bool);
+ void page_remove_rmap(struct page *, bool);
+ 
+diff --git a/mm/memory.c b/mm/memory.c
+index 9f9e5bb7a556..51bc8315281e 100644
+--- a/mm/memory.c
++++ b/mm/memory.c
+@@ -2376,7 +2376,7 @@ static int wp_page_copy(struct vm_fault *vmf)
+ 		 * thread doing COW.
+ 		 */
+ 		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
+-		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
++		__page_add_new_anon_rmap(new_page, vma, vmf->address, false);
+ 		mem_cgroup_commit_charge(new_page, memcg, false, false);
+ 		__lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
+ 		/*
+@@ -2847,7 +2847,7 @@ int do_swap_page(struct vm_fault *vmf)
+ 		mem_cgroup_commit_charge(page, memcg, true, false);
+ 		activate_page(page);
+ 	} else { /* ksm created a completely new copy */
+-		page_add_new_anon_rmap(page, vma, vmf->address, false);
++		__page_add_new_anon_rmap(page, vma, vmf->address, false);
+ 		mem_cgroup_commit_charge(page, memcg, false, false);
+ 		__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
+ 	}
+@@ -2985,7 +2985,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
+ 	}
+ 
+ 	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+-	page_add_new_anon_rmap(page, vma, vmf->address, false);
++	__page_add_new_anon_rmap(page, vma, vmf->address, false);
+ 	mem_cgroup_commit_charge(page, memcg, false, false);
+ 	__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
+ setpte:
+@@ -3237,7 +3237,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
+ 	/* copy-on-write page */
+ 	if (write && !(vmf->vma_flags & VM_SHARED)) {
+ 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
+-		page_add_new_anon_rmap(page, vma, vmf->address, false);
++		__page_add_new_anon_rmap(page, vma, vmf->address, false);
+ 		mem_cgroup_commit_charge(page, memcg, false, false);
+ 		__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
+ 	} else {
+diff --git a/mm/rmap.c b/mm/rmap.c
+index c1286d47aa1f..0c9f8ded669a 100644
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -1122,7 +1122,7 @@ void do_page_add_anon_rmap(struct page *page,
  }
  
-+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-+{
-+	return __maybe_mkwrite(pte, vma->vm_flags);
-+}
-+
- int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
- 		struct page *page);
- int finish_fault(struct vm_fault *vmf);
-diff --git a/mm/memory.c b/mm/memory.c
-index cbd7e5c3a42f..c24891d5676f 100644
---- a/mm/memory.c
-+++ b/mm/memory.c
-@@ -2438,7 +2438,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
+ /**
+- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
++ * __page_add_new_anon_rmap - add pte mapping to a new anonymous page
+  * @page:	the page to add the mapping to
+  * @vma:	the vm area in which the mapping is added
+  * @address:	the user virtual address mapped
+@@ -1132,12 +1132,11 @@ void do_page_add_anon_rmap(struct page *page,
+  * This means the inc-and-test can be bypassed.
+  * Page does not have to be locked.
+  */
+-void page_add_new_anon_rmap(struct page *page,
++void __page_add_new_anon_rmap(struct page *page,
+ 	struct vm_area_struct *vma, unsigned long address, bool compound)
+ {
+ 	int nr = compound ? hpage_nr_pages(page) : 1;
  
- 	flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
- 	entry = pte_mkyoung(vmf->orig_pte);
--	entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-+	entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
- 	if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
- 		update_mmu_cache(vma, vmf->address, vmf->pte);
- 	pte_unmap_unlock(vmf->pte, vmf->ptl);
-@@ -2529,8 +2529,8 @@ static int wp_page_copy(struct vm_fault *vmf)
- 			inc_mm_counter_fast(mm, MM_ANONPAGES);
- 		}
- 		flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
--		entry = mk_pte(new_page, vma->vm_page_prot);
--		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-+		entry = mk_pte(new_page, vmf->vma_page_prot);
-+		entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
- 		/*
- 		 * Clear the pte entry and flush it first, before updating the
- 		 * pte with the new entry. This will avoid a race condition
+-	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
+ 	__SetPageSwapBacked(page);
+ 	if (compound) {
+ 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 -- 
 2.7.4
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help