--- v8
+++ v7
@@ -1,75 +1,154 @@
-Architectures like ppc64 support faster mremap only with radix
-translation. Hence allow a runtime check for fast mremap support.
+CPU 1                           CPU 2                                   CPU 3
+mremap(old_addr, new_addr)      page_shrinker/try_to_unmap_one
+
+mmap_write_lock_killable()
+
+                                addr = old_addr
+                                lock(pte_ptl)
+lock(pmd_ptl)
+pmd = *old_pmd
+pmd_clear(old_pmd)
+flush_tlb_range(old_addr)
+
+*new_pmd = pmd
+                                                                        *new_addr = 10; and fills
+                                                                        TLB with new addr
+                                                                        and old pfn
+
+unlock(pmd_ptl)
+                                ptep_clear_flush()
+                                old pfn is free.
+                                                                        Stale TLB entry
+
+Fix this race by holding the pmd lock in the pageout path. This still doesn't
+handle the race between MOVE_PUD and pageout.
+
+Fixes: 2c91bd4a4e2e ("mm: speed up mremap by 20x on large regions")
+Link: https://lore.kernel.org/linux-mm/CAHk-=wgXVR04eBNtxQfevontWnP6FDm+oj5vauQXP3S-huwbPw@mail.gmail.com
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
---
- arch/powerpc/include/asm/tlb.h | 6 ++++++
- mm/mremap.c | 15 ++++++++++++++-
- 2 files changed, 20 insertions(+), 1 deletion(-)
+ include/linux/rmap.h | 9 ++++++---
+ mm/page_vma_mapped.c | 36 ++++++++++++++++++------------------
+ 2 files changed, 24 insertions(+), 21 deletions(-)
-diff --git a/arch/powerpc/include/asm/tlb.h b/arch/powerpc/include/asm/tlb.h
-index 160422a439aa..09a9ae5f3656 100644
---- a/arch/powerpc/include/asm/tlb.h
-+++ b/arch/powerpc/include/asm/tlb.h
-@@ -83,5 +83,11 @@ static inline int mm_is_thread_local(struct mm_struct *mm)
- }
- #endif
+diff --git a/include/linux/rmap.h b/include/linux/rmap.h
+index def5c62c93b3..272ab0c2b60b 100644
+--- a/include/linux/rmap.h
++++ b/include/linux/rmap.h
+@@ -207,7 +207,8 @@ struct page_vma_mapped_walk {
+ unsigned long address;
+ pmd_t *pmd;
+ pte_t *pte;
+- spinlock_t *ptl;
++ spinlock_t *pte_ptl;
++ spinlock_t *pmd_ptl;
+ unsigned int flags;
+ };
-+#define arch_supports_page_table_move arch_supports_page_table_move
-+static inline bool arch_supports_page_table_move(void)
-+{
-+ return radix_enabled();
-+}
-+
- #endif /* __KERNEL__ */
- #endif /* __ASM_POWERPC_TLB_H */
-diff --git a/mm/mremap.c b/mm/mremap.c
-index c3cad539a7aa..ca9d345f22e8 100644
---- a/mm/mremap.c
-+++ b/mm/mremap.c
-@@ -25,7 +25,7 @@
- #include <linux/userfaultfd_k.h>
-
- #include <asm/cacheflush.h>
--#include <asm/tlbflush.h>
-+#include <asm/tlb.h>
- #include <asm/pgalloc.h>
-
- #include "internal.h"
-@@ -210,6 +210,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
- drop_rmap_locks(vma);
+@@ -216,8 +217,10 @@ static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
+ /* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
+ if (pvmw->pte && !PageHuge(pvmw->page))
+ pte_unmap(pvmw->pte);
+- if (pvmw->ptl)
+- spin_unlock(pvmw->ptl);
++ if (pvmw->pte_ptl)
++ spin_unlock(pvmw->pte_ptl);
++ if (pvmw->pmd_ptl)
++ spin_unlock(pvmw->pmd_ptl);
}
-+#ifndef arch_supports_page_table_move
-+#define arch_supports_page_table_move arch_supports_page_table_move
-+static inline bool arch_supports_page_table_move(void)
-+{
-+ return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) ||
-+ IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
-+}
-+#endif
+ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
+diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
+index 2cf01d933f13..87a2c94c7e27 100644
+--- a/mm/page_vma_mapped.c
++++ b/mm/page_vma_mapped.c
+@@ -47,8 +47,10 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw)
+ return false;
+ }
+ }
+- pvmw->ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
+- spin_lock(pvmw->ptl);
++ if (USE_SPLIT_PTE_PTLOCKS) {
++ pvmw->pte_ptl = pte_lockptr(pvmw->vma->vm_mm, pvmw->pmd);
++ spin_lock(pvmw->pte_ptl);
++ }
+ return true;
+ }
+
+@@ -162,8 +164,8 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+ if (!pvmw->pte)
+ return false;
+
+- pvmw->ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
+- spin_lock(pvmw->ptl);
++ pvmw->pte_ptl = huge_pte_lockptr(page_hstate(page), mm, pvmw->pte);
++ spin_lock(pvmw->pte_ptl);
+ if (!check_pte(pvmw))
+ return not_found(pvmw);
+ return true;
+@@ -179,6 +181,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+ if (!pud_present(*pud))
+ return false;
+ pvmw->pmd = pmd_offset(pud, pvmw->address);
++ pvmw->pmd_ptl = pmd_lock(mm, pvmw->pmd);
+ /*
+ * Make sure the pmd value isn't cached in a register by the
+ * compiler and used as a stale value after we've observed a
+@@ -186,7 +189,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+ */
+ pmde = READ_ONCE(*pvmw->pmd);
+ if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) {
+- pvmw->ptl = pmd_lock(mm, pvmw->pmd);
+ if (likely(pmd_trans_huge(*pvmw->pmd))) {
+ if (pvmw->flags & PVMW_MIGRATION)
+ return not_found(pvmw);
+@@ -206,14 +208,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+ }
+ }
+ return not_found(pvmw);
+- } else {
+- /* THP pmd was split under us: handle on pte level */
+- spin_unlock(pvmw->ptl);
+- pvmw->ptl = NULL;
+ }
+- } else if (!pmd_present(pmde)) {
+- return false;
+- }
++ } else if (!pmd_present(pmde))
++ return not_found(pvmw);
+
- #ifdef CONFIG_HAVE_MOVE_PMD
- static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
-@@ -218,6 +227,8 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- struct mm_struct *mm = vma->vm_mm;
- pmd_t pmd;
+ if (!map_pte(pvmw))
+ goto next_pte;
+ while (1) {
+@@ -233,19 +231,21 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
+ /* Did we cross page table boundary? */
+ if (pvmw->address % PMD_SIZE == 0) {
+ pte_unmap(pvmw->pte);
+- if (pvmw->ptl) {
+- spin_unlock(pvmw->ptl);
+- pvmw->ptl = NULL;
++ if (pvmw->pte_ptl) {
++ spin_unlock(pvmw->pte_ptl);
++ pvmw->pte_ptl = NULL;
+ }
++ spin_unlock(pvmw->pmd_ptl);
++ pvmw->pmd_ptl = NULL;
+ goto restart;
+ } else {
+ pvmw->pte++;
+ }
+ } while (pte_none(*pvmw->pte));
-+ if (!arch_supports_page_table_move())
-+ return false;
- /*
- * The destination pmd shouldn't be established, free_pgtables()
- * should have released it.
-@@ -284,6 +295,8 @@ static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
- struct mm_struct *mm = vma->vm_mm;
- pud_t pud;
-
-+ if (!arch_supports_page_table_move())
-+ return false;
- /*
- * The destination pud shouldn't be established, free_pgtables()
- * should have released it.
+- if (!pvmw->ptl) {
+- pvmw->ptl = pte_lockptr(mm, pvmw->pmd);
+- spin_lock(pvmw->ptl);
++ if (USE_SPLIT_PTE_PTLOCKS && !pvmw->pte_ptl) {
++ pvmw->pte_ptl = pte_lockptr(mm, pvmw->pmd);
++ spin_lock(pvmw->pte_ptl);
+ }
+ }
+ }
--
2.31.1