Inter-revision diff: patch 1

Comparing v8 (message) to v7 (message)

--- v8
+++ v7
@@ -1,75 +1,154 @@
-Architectures like ppc64 support faster mremap only with radix
-translation. Hence allow a runtime check w.r.t support for fast mremap.
+CPU 1				CPU 2					CPU 3
 
+mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one
+
+mmap_write_lock_killable()
+
+				addr = old_addr
+				lock(pte_ptl)
+lock(pmd_ptl)
+pmd = *old_pmd
+pmd_clear(old_pmd)
+flush_tlb_range(old_addr)
+
+*new_pmd = pmd
+									*new_addr = 10; and fills
+									TLB with new addr
+									and old pfn
+
+unlock(pmd_ptl)
+				ptep_clear_flush()
+				old pfn is free.
+									Stale TLB entry
+
+Fix this race by holding pmd lock in pageout. This still doesn't handle the race
+between MOVE_PUD and pageout.
+
+Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
+Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@mail.gmail.com
 Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
 ---
- arch/powerpc/include/asm/tlb.h |  6 ++++++
- mm/mremap.c                    | 15 ++++++++++++++-
- 2 files changed, 20 insertions(+), 1 deletion(-)
+ include/linux/rmap.h |  9 ++++++---
+ mm/page_vma_mapped.c | 36 ++++++++++++++++++------------------
+ 2 files changed, 24 insertions(+), 21 deletions(-)
 
-diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
-index 160422a439aa..09a9ae5f3656 100644
---- a/arch/powerpc/include/asm/tlb.h
-+++ b/arch/powerpc/include/asm/tlb.h
-@@ -83,5 +83,11 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
- }
- #endif
+diff --git a/include/linux/rmap.h b/include/linux/rmap.h
+index def5c62c93b3..272ab0c2b60b 100644
+--- a/include/linux/rmap.h
++++ b/include/linux/rmap.h
+@@ -207,7 +207,8 @@ struct page_vma_mapped_walk {
+ 	unsigned long address;
+ 	pmd_t *pmd;
+ 	pte_t *pte;
+-	spinlock_t *ptl;
++	spinlock_t *pte_ptl;
++	spinlock_t *pmd_ptl;
+ 	unsigned int flags;
+ };
  
-+#define arch_supports_page_table_move arch_supports_page_table_move
-+static inline bool arch_supports_page_table_move(void)
-+{
-+	return radix_enabled();
-+}
-+
- #endif /* __KERNEL__ */
- #endif /* __ASM_POWERPC_TLB_H */
-diff --git a/mm/mremap.c b/mm/mremap.c
-index c3cad539a7aa..ca9d345f22e8 100644
---- a/mm/mremap.c
-+++ b/mm/mremap.c
-@@ -25,7 +25,7 @@
- #include <linux/userfaultfd_k.h>
- 
- #include <asm/cacheflush.h>
--#include <asm/tlbflush.h>
-+#include <asm/tlb.h>
- #include <asm/pgalloc.h>
- 
- #include "internal.h"
-@@ -210,6 +210,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
- 		drop_rmap_locks(vma);
+@@ -216,8 +217,10 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
+ 	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
+ 	if (pvmw->pte && !PageHuge(pvmw->page))
+ 		pte_unmap(pvmw->pte);
+-	if (pvmw->ptl)
+-		spin_unlock(pvmw->ptl);
++	if (pvmw->pte_ptl)
++		spin_unlock(pvmw->pte_ptl);
++	if (pvmw->pmd_ptl)
++		spin_unlock(pvmw->pmd_ptl);
  }
  
-+#ifndef arch_supports_page_table_move
-+#define arch_supports_page_table_move arch_supports_page_table_move
-+static inline bool arch_supports_page_table_move(void)
-+{
-+	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
-+		IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
-+}
-+#endif
+ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
+diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
+index 2cf01d933f13..87a2c94c7e27 100644
+--- a/mm/page_vma_mapped.c
++++ b/mm/page_vma_mapped.c
+@@ -47,8 +47,10 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
+ 				return false;
+ 		}
+ 	}
+-	pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+-	spin_lock(pvmw->ptl);
++	if (USE_SPLIT_PTE_PTLOCKS) {
++		pvmw->pte_ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
++		spin_lock(pvmw->pte_ptl);
++	}
+ 	return true;
+ }
+ 
+@@ -162,8 +164,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+ 		if (!pvmw->pte)
+ 			return false;
+ 
+-		pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
+-		spin_lock(pvmw->ptl);
++		pvmw->pte_ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
++		spin_lock(pvmw->pte_ptl);
+ 		if (!check_pte(pvmw))
+ 			return not_found(pvmw);
+ 		return true;
+@@ -179,6 +181,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+ 	if (!pud_present(*pud))
+ 		return false;
+ 	pvmw->pmd = pmd_offset(pud, pvmw->address);
++	pvmw->pmd_ptl = pmd_lock(mm, pvmw->pmd);
+ 	/*
+ 	 * Make sure the pmd value isn't cached in a register by the
+ 	 * compiler and used as a stale value after we've observed a
+@@ -186,7 +189,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+ 	 */
+ 	pmde = READ_ONCE(*pvmw->pmd);
+ 	if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+-		pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ 		if (likely(pmd_trans_huge(*pvmw->pmd))) {
+ 			if (pvmw->flags & PVMW_MIGRATION)
+ 				return not_found(pvmw);
+@@ -206,14 +208,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+ 				}
+ 			}
+ 			return not_found(pvmw);
+-		} else {
+-			/* THP pmd was split under us: handle on pte level */
+-			spin_unlock(pvmw->ptl);
+-			pvmw->ptl = NULL;
+ 		}
+-	} else if (!pmd_present(pmde)) {
+-		return false;
+-	}
++	} else if (!pmd_present(pmde))
++		return not_found(pvmw);
 +
- #ifdef CONFIG_HAVE_MOVE_PMD
- static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- 		  unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
-@@ -218,6 +227,8 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- 	struct mm_struct *mm = vma->vm_mm;
- 	pmd_t pmd;
+ 	if (!map_pte(pvmw))
+ 		goto next_pte;
+ 	while (1) {
+@@ -233,19 +231,21 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+ 			/* Did we cross page table boundary? */
+ 			if (pvmw->address % PMD_SIZE == 0) {
+ 				pte_unmap(pvmw->pte);
+-				if (pvmw->ptl) {
+-					spin_unlock(pvmw->ptl);
+-					pvmw->ptl = NULL;
++				if (pvmw->pte_ptl) {
++					spin_unlock(pvmw->pte_ptl);
++					pvmw->pte_ptl = NULL;
+ 				}
++				spin_unlock(pvmw->pmd_ptl);
++				pvmw->pmd_ptl = NULL;
+ 				goto restart;
+ 			} else {
+ 				pvmw->pte++;
+ 			}
+ 		} while (pte_none(*pvmw->pte));
  
-+	if (!arch_supports_page_table_move())
-+		return false;
- 	/*
- 	 * The destination pmd shouldn't be established, free_pgtables()
- 	 * should have released it.
-@@ -284,6 +295,8 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
- 	struct mm_struct *mm = vma->vm_mm;
- 	pud_t pud;
- 
-+	if (!arch_supports_page_table_move())
-+		return false;
- 	/*
- 	 * The destination pud shouldn't be established, free_pgtables()
- 	 * should have released it.
+-		if (!pvmw->ptl) {
+-			pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+-			spin_lock(pvmw->ptl);
++		if (USE_SPLIT_PTE_PTLOCKS && !pvmw->pte_ptl) {
++			pvmw->pte_ptl = pte_lockptr(mm, pvmw->pmd);
++			spin_lock(pvmw->pte_ptl);
+ 		}
+ 	}
+ }
 -- 
 2.31.1
 
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help