Inter-revision diff: patch 15

Comparing v5 (message) to v4 (message)

--- v5
+++ v4
@@ -1,132 +1,87 @@
-When dealing with speculative page fault handler, we may race with VMA
-being split or merged. In this case the vma->vm_start and vma->vm_end
-fields may not match the address at which the page fault is occurring.
+There is a deadlock when a CPU is doing a speculative page fault and
+another one is calling do_unmap().
 
-This can only happen when the VMA is split, but in that case the
-anon_vma pointer of the new VMA will be the same as the original one,
-because in __split_vma the new->anon_vma is set to src->anon_vma when
-*new = *vma.
+The deadlock occurs because the speculative path tries to spinlock the
+pte while interrupts are disabled. Meanwhile the other CPU, in the
+unmap path, has locked the pte and is waiting for all CPUs to
+invalidate the TLB. As the CPU doing the speculative fault has
+interrupts disabled, it can't invalidate the TLB and can't get the lock.
 
-So even if the VMA boundaries are not correct, the anon_vma pointer is
-still valid.
+Since we are in a speculative path, we can race with other mm actions.
+So let's assume that the lock may not be acquired, and fail the
+speculative page fault in that case.
 
-If the VMA has been merged, then the VMA in which it has been merged
-must have the same anon_vma pointer otherwise the merge can't be done.
+Here are the stacks captured during the deadlock:
 
-So in all cases we know that the anon_vma is valid, since we have
-checked before starting the speculative page fault that the anon_vma
-pointer is valid for this VMA, and since there is an anon_vma this
-means that at one time a page has been backed and that before the VMA
-is cleaned, the page table lock would have to be grabbed to clean the
-PTE, and the anon_vma field is checked once the PTE is locked.
+	CPU 0
+	native_flush_tlb_others+0x7c/0x260
+	flush_tlb_mm_range+0x6a/0x220
+	tlb_flush_mmu_tlbonly+0x63/0xc0
+	unmap_page_range+0x897/0x9d0
+	? unmap_single_vma+0x7d/0xe0
+	? release_pages+0x2b3/0x360
+	unmap_single_vma+0x7d/0xe0
+	unmap_vmas+0x51/0xa0
+	unmap_region+0xbd/0x130
+	do_munmap+0x279/0x460
+	SyS_munmap+0x53/0x70
 
-This patch introduces a new __page_add_new_anon_rmap() service which
-doesn't check the VMA boundaries, and creates a new inline one
-which does the check.
-
-When called from a page fault handler, if this is not a speculative one,
-there is a guarantee that vm_start and vm_end match the faulting address,
-so this check is useless. In the context of the speculative page fault
-handler, this check may be wrong but anon_vma is still valid as explained
-above.
+	CPU 1
+	do_raw_spin_lock+0x14e/0x160
+	_raw_spin_lock+0x5d/0x80
+	? pte_map_lock+0x169/0x1b0
+	pte_map_lock+0x169/0x1b0
+	handle_pte_fault+0xbf2/0xd80
+	? trace_hardirqs_on+0xd/0x10
+	handle_speculative_fault+0x272/0x280
+	handle_speculative_fault+0x5/0x280
+	__do_page_fault+0x187/0x580
+	trace_do_page_fault+0x52/0x260
+	do_async_page_fault+0x19/0x70
+	async_page_fault+0x28/0x30
 
 Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
 ---
- include/linux/rmap.h | 12 ++++++++++--
- mm/memory.c          |  8 ++++----
- mm/rmap.c            |  5 ++---
- 3 files changed, 16 insertions(+), 9 deletions(-)
+ mm/memory.c | 19 ++++++++++++++++---
+ 1 file changed, 16 insertions(+), 3 deletions(-)
 
-diff --git a/include/linux/rmap.h b/include/linux/rmap.h
-index 733d3d8181e2..d91be69c1c60 100644
---- a/include/linux/rmap.h
-+++ b/include/linux/rmap.h
-@@ -173,8 +173,16 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *,
- 		unsigned long, bool);
- void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
- 			   unsigned long, int);
--void page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
--		unsigned long, bool);
-+void __page_add_new_anon_rmap(struct page *, struct vm_area_struct *,
-+			      unsigned long, bool);
-+static inline void page_add_new_anon_rmap(struct page *page,
-+					  struct vm_area_struct *vma,
-+					  unsigned long address, bool compound)
-+{
-+	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
-+	__page_add_new_anon_rmap(page, vma, address, compound);
-+}
-+
- void page_add_file_rmap(struct page *, bool);
- void page_remove_rmap(struct page *, bool);
- 
 diff --git a/mm/memory.c b/mm/memory.c
-index 4ad4f0a6f652..3705ff3e04d5 100644
+index 6761e3007500..8abfc0e12e25 100644
 --- a/mm/memory.c
 +++ b/mm/memory.c
-@@ -2551,7 +2551,7 @@ static int wp_page_copy(struct vm_fault *vmf)
- 		 * thread doing COW.
- 		 */
- 		ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
--		page_add_new_anon_rmap(new_page, vma, vmf->address, false);
-+		__page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- 		mem_cgroup_commit_charge(new_page, memcg, false, false);
- 		__lru_cache_add_active_or_unevictable(new_page, vmf->vma_flags);
- 		/*
-@@ -3065,7 +3065,7 @@ int do_swap_page(struct vm_fault *vmf)
+@@ -2476,7 +2476,8 @@ static bool pte_spinlock(struct vm_fault *vmf)
+ 		goto out;
  
- 	/* ksm created a completely new copy */
- 	if (unlikely(page != swapcache && swapcache)) {
--		page_add_new_anon_rmap(page, vma, vmf->address, false);
-+		__page_add_new_anon_rmap(page, vma, vmf->address, false);
- 		mem_cgroup_commit_charge(page, memcg, false, false);
- 		__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
- 	} else {
-@@ -3215,7 +3215,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
- 	}
+ 	vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+-	spin_lock(vmf->ptl);
++	if (unlikely(!spin_trylock(vmf->ptl)))
++		goto out;
  
- 	inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
--	page_add_new_anon_rmap(page, vma, vmf->address, false);
-+	__page_add_new_anon_rmap(page, vma, vmf->address, false);
- 	mem_cgroup_commit_charge(page, memcg, false, false);
- 	__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
- setpte:
-@@ -3467,7 +3467,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
- 	/* copy-on-write page */
- 	if (write && !(vmf->vma_flags & VM_SHARED)) {
- 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
--		page_add_new_anon_rmap(page, vma, vmf->address, false);
-+		__page_add_new_anon_rmap(page, vma, vmf->address, false);
- 		mem_cgroup_commit_charge(page, memcg, false, false);
- 		__lru_cache_add_active_or_unevictable(page, vmf->vma_flags);
- 	} else {
-diff --git a/mm/rmap.c b/mm/rmap.c
-index 787c07fb37dc..357ea765e795 100644
---- a/mm/rmap.c
-+++ b/mm/rmap.c
-@@ -1138,7 +1138,7 @@ void do_page_add_anon_rmap(struct page *page,
- }
+ 	if (vma_has_changed(vmf)) {
+ 		spin_unlock(vmf->ptl);
+@@ -2521,8 +2522,20 @@ static bool pte_map_lock(struct vm_fault *vmf)
+ 	if (vma_has_changed(vmf))
+ 		goto out;
  
- /**
-- * page_add_new_anon_rmap - add pte mapping to a new anonymous page
-+ * __page_add_new_anon_rmap - add pte mapping to a new anonymous page
-  * @page:	the page to add the mapping to
-  * @vma:	the vm area in which the mapping is added
-  * @address:	the user virtual address mapped
-@@ -1148,12 +1148,11 @@ void do_page_add_anon_rmap(struct page *page,
-  * This means the inc-and-test can be bypassed.
-  * Page does not have to be locked.
-  */
--void page_add_new_anon_rmap(struct page *page,
-+void __page_add_new_anon_rmap(struct page *page,
- 	struct vm_area_struct *vma, unsigned long address, bool compound)
- {
- 	int nr = compound ? hpage_nr_pages(page) : 1;
- 
--	VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
- 	__SetPageSwapBacked(page);
- 	if (compound) {
- 		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+-	pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
+-				  vmf->address, &ptl);
++	/*
++	 * Same as pte_offset_map_lock() except that we call
++	 * spin_trylock() in place of spin_lock() to avoid race with
++	 * unmap path which may have the lock and wait for this CPU
++	 * to invalidate TLB but this CPU has irq disabled.
++	 * Since we are in a speculative path, accept it could fail
++	 */
++	ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
++	pte = pte_offset_map(vmf->pmd, vmf->address);
++	if (unlikely(!spin_trylock(ptl))) {
++		pte_unmap(pte);
++		goto out;
++	}
++
+ 	if (vma_has_changed(vmf)) {
+ 		pte_unmap_unlock(pte, ptl);
+ 		goto out;
 -- 
 2.7.4
 
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help