Thread: 27 messages, 6 authors, 2021-07-27
STALE (1802d)
Revisions (4)
  1. v1 current
  2. v2 [diff vs current]
  3. v3 [diff vs current]
  4. v3 [diff vs current]

[PATCH 6/7] mm: defer freeing PTE page table for a grace period

From: Qi Zheng <hidden>
Date: 2021-07-18 04:31:26
Also in: linux-mm, lkml
Subsystem: control group - memory resource controller (memcg), filesystems (vfs and infrastructure), hmm - heterogeneous memory management, memory management, memory management - core, memory management - gup (get user pages), memory management - ksm (kernel samepage merging), memory management - memory policy and migration, memory management - rmap (reverse mapping), memory management - swap, memory management - thp (transparent huge page), memory mapping, memory mapping - madvise (memory advice), proc filesystem, the rest · Maintainers: Johannes Weiner, Michal Hocko, Roman Gushchin, Shakeel Butt, Alexander Viro, Christian Brauner, Jason Gunthorpe, Leon Romanovsky, Andrew Morton, David Hildenbrand, Lorenzo Stoakes, Chris Li, Kairui Song, Liam R. Howlett, Linus Torvalds

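Free the PTE page table via call_rcu() so that its release is
deferred until after an RCU grace period. A page table walker
that is inside an rcu_read_lock() section can then safely try to
take a reference on the PTE page table, so we don't need to hold
the pmd lock anymore when we do pte_try_get()/pte_alloc_get().
This can improve performance and simplify the code logic. Since
the pmd lock is no longer taken, pte_try_get() also drops its
mm parameter.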

Signed-off-by: Qi Zheng <redacted>
---
 fs/proc/task_mmu.c      |  8 ++++----
 include/linux/pte_ref.h | 28 +++++++++++++++-------------
 mm/gup.c                |  2 +-
 mm/hmm.c                |  2 +-
 mm/khugepaged.c         |  4 ++--
 mm/ksm.c                |  2 +-
 mm/madvise.c            |  6 +++---
 mm/memcontrol.c         |  4 ++--
 mm/memory.c             | 14 +++++++-------
 mm/mempolicy.c          |  2 +-
 mm/migrate.c            |  2 +-
 mm/mincore.c            |  2 +-
 mm/mprotect.c           |  2 +-
 mm/page_vma_mapped.c    |  2 +-
 mm/pagewalk.c           |  2 +-
 mm/pte_ref.c            | 10 +++++++++-
 mm/swapfile.c           |  2 +-
 17 files changed, 52 insertions(+), 42 deletions(-)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b3cf4b8a91d6..f3c9c984bc29 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -584,7 +584,7 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	}
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
-	    !pte_try_get(vma->vm_mm, pmd))
+	    !pte_try_get(pmd))
 		goto out;
 	/*
 	 * The mmap_lock held all the way back in m_start() is what
@@ -1148,7 +1148,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 	}
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
-	    !pte_try_get(vma->vm_mm, pmd))
+	    !pte_try_get(pmd))
 		return 0;
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -1482,7 +1482,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		return 0;
 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
 
-	if (!pte_try_get(walk->mm, pmdp))
+	if (!pte_try_get(pmdp))
 		return 0;
 
 	/*
@@ -1824,7 +1824,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 	if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
 		return 0;
 #endif
-	if (!pte_try_get(walk->mm, pmd))
+	if (!pte_try_get(pmd))
 		return 0;
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
 	do {
diff --git a/include/linux/pte_ref.h b/include/linux/pte_ref.h
index 695fbe8b991b..f4d20faab684 100644
--- a/include/linux/pte_ref.h
+++ b/include/linux/pte_ref.h
@@ -74,16 +74,17 @@ static inline bool pte_get_unless_zero(pmd_t *pmdp)
  * i_mmap_lock or when parallel threads are excluded by other means
  * which can make @pmdp entry stable.
  */
-static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pte_try_get(pmd_t *pmdp)
 {
 	bool retval = true;
-	spinlock_t *ptl;
+	pmd_t pmdval;
 
-	ptl = pmd_lock(mm, pmdp);
-	if (pmd_leaf(*pmdp) || !pmd_present(*pmdp) ||
-	    !pte_get_unless_zero(pmdp))
+	rcu_read_lock();
+	pmdval = READ_ONCE(*pmdp);
+	if (pmd_leaf(pmdval) || !pmd_present(pmdval) ||
+	    !pte_get_unless_zero(&pmdval))
 		retval = false;
-	spin_unlock(ptl);
+	rcu_read_unlock();
 
 	return retval;
 }
@@ -129,21 +130,22 @@ static inline void pte_put_vmf(struct vm_fault *vmf)
 
 static inline int pte_alloc_try_get(struct mm_struct *mm, pmd_t *pmdp)
 {
-	if (!pte_try_get(mm, pmdp))
+	if (!pte_try_get(pmdp))
 		return __pte_alloc_try_get(mm, pmdp);
 	return 1;
 }
 
 static inline int pte_alloc_get(struct mm_struct *mm, pmd_t *pmdp)
 {
-	spinlock_t *ptl;
+	pmd_t pmdval;
 
-	ptl = pmd_lock(mm, pmdp);
-	if (pmd_none(*pmdp) || !pte_get_unless_zero(pmdp)) {
-		spin_unlock(ptl);
+	rcu_read_lock();
+	pmdval = READ_ONCE(*pmdp);
+	if (pmd_none(pmdval) || !pte_get_unless_zero(&pmdval)) {
+		rcu_read_unlock();
 		return __pte_alloc_get(mm, pmdp);
 	}
-	spin_unlock(ptl);
+	rcu_read_unlock();
 	return 0;
 }
 #else
@@ -173,7 +175,7 @@ static inline bool pte_get_unless_zero(pmd_t *pmdp)
 	return true;
 }
 
-static inline bool pte_try_get(struct mm_struct *mm, pmd_t *pmdp)
+static inline bool pte_try_get(pmd_t *pmdp)
 {
 	return true;
 }
diff --git a/mm/gup.c b/mm/gup.c
index 3e2a153cb18e..a5be18e349cd 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -503,7 +503,7 @@ static struct page *follow_page_pte(struct vm_area_struct *vma,
 	if (unlikely(pmd_bad(*pmd)))
 		return no_page_table(vma, flags);
 
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		return no_page_table(vma, flags);
 
 	ptep = pte_offset_map_lock(mm, pmd, address, &ptl);
diff --git a/mm/hmm.c b/mm/hmm.c
index 29bb379510cc..d0e767c5fbb6 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -380,7 +380,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
 		return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
 	}
 
-	if (!pte_try_get(walk->mm, pmdp))
+	if (!pte_try_get(pmdp))
 		goto again;
 
 	ptep = pte_offset_map(pmdp, addr);
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index e6c4d1b7a12a..c653edd75345 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -1240,7 +1240,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		goto out;
 	}
 
-	if (!pte_try_get(mm, pmd)) {
+	if (!pte_try_get(pmd)) {
 		result = SCAN_PMD_NULL;
 		goto out;
 	}
@@ -1469,7 +1469,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
 	if (!pmd)
 		goto drop_hpage;
 
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		goto drop_hpage;
 	start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
 
diff --git a/mm/ksm.c b/mm/ksm.c
index 2e106f58dad0..5671683890c0 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1133,7 +1133,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
 	if (!pmd)
 		goto out;
 
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		goto out;
 
 	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
diff --git a/mm/madvise.c b/mm/madvise.c
index 4c4b35292212..0e849bbf268b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -193,7 +193,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
 	    pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
-	    !pte_try_get(vma->vm_mm, pmd))
+	    !pte_try_get(pmd))
 		return 0;
 
 	for (index = start; index != end; index += PAGE_SIZE) {
@@ -396,7 +396,7 @@ static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
 	if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
 		return 0;
 #endif
-	if (!pte_try_get(vma->vm_mm, pmd))
+	if (!pte_try_get(pmd))
 		return 0;
 	tlb_change_page_size(tlb, PAGE_SIZE);
 	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -596,7 +596,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 			goto next;
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
-	     pmd_trans_unstable(pmd)) || !pte_try_get(mm, pmd))
+	     pmd_trans_unstable(pmd)) || !pte_try_get(pmd))
 		return 0;
 	nr_put++;
 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 4f19e5f2cd18..f8c1cabdd259 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -5835,7 +5835,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
 	}
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
-	    !pte_try_get(vma->vm_mm, pmd))
+	    !pte_try_get(pmd))
 		return 0;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; pte++, addr += PAGE_SIZE)
@@ -6058,7 +6058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
 	if (!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd))
 		return 0;
 retry:
-	if (!pte_try_get(vma->vm_mm, pmd))
+	if (!pte_try_get(pmd))
 		return 0;
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	for (; addr != end; addr += PAGE_SIZE) {
diff --git a/mm/memory.c b/mm/memory.c
index 242ed135bde4..c8ee0074c730 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1143,7 +1143,7 @@ copy_pmd_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma,
 		if (pmd_none_or_clear_bad(src_pmd))
 			continue;
 
-		if (!pte_try_get(src_mm, src_pmd))
+		if (!pte_try_get(src_pmd))
 			goto retry;
 		if (copy_pte_range(dst_vma, src_vma, dst_pmd, src_pmd,
 				   addr, next)) {
@@ -1481,7 +1481,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
 		 */
 		if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
 		     pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
-		     !pte_try_get(tlb->mm, pmd))
+		     !pte_try_get(pmd))
 			goto next;
 
 		next = zap_pte_range(tlb, vma, pmd, addr, next, details);
@@ -2608,7 +2608,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
 				continue;
 			pmd_clear_bad(pmd);
 		}
-		if (!create && !pte_try_get(mm, pmd))
+		if (!create && !pte_try_get(pmd))
 			goto retry;
 		err = apply_to_pte_range(mm, pmd, addr, next,
 					 fn, data, create, mask);
@@ -4078,7 +4078,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
 		}
 	} else if (pmd_devmap_trans_unstable(vmf->pmd)) { /* See comment in handle_pte_fault() */
 		return 0;
-	} else if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
+	} else if (!pte_try_get(vmf->pmd)) {
 		goto retry;
 	}
 
@@ -4319,7 +4319,7 @@ static vm_fault_t do_fault(struct vm_fault *vmf)
 			ret = VM_FAULT_SIGBUS;
 			goto out;
 		} else {
-			if (!pte_try_get(vma->vm_mm, vmf->pmd)) {
+			if (!pte_try_get(vmf->pmd)) {
 				ret = VM_FAULT_SIGBUS;
 				goto out;
 			}
@@ -4579,7 +4579,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
 		if (pmd_devmap_trans_unstable(vmf->pmd))
 			return 0;
 
-		if (!pte_try_get(vmf->vma->vm_mm, vmf->pmd))
+		if (!pte_try_get(vmf->pmd))
 			goto retry;
 
 		if (IS_ENABLED(CONFIG_FREE_USER_PTE))
@@ -5000,7 +5000,7 @@ int follow_invalidate_pte(struct mm_struct *mm, unsigned long address,
 					(address & PAGE_MASK) + PAGE_SIZE);
 		mmu_notifier_invalidate_range_start(range);
 	}
-	if (!pte_try_get(mm, pmd))
+	if (!pte_try_get(pmd))
 		goto out;
 	ptep = pte_offset_map_lock(mm, pmd, address, ptlp);
 	if (!pte_present(*ptep))
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index cbb3640717ff..b19243d8fe56 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -520,7 +520,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 	/* THP was split, fall through to pte walk */
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
-	    !pte_try_get(walk->mm, pmd))
+	    !pte_try_get(pmd))
 		return 0;
 
 	mapped_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
diff --git a/mm/migrate.c b/mm/migrate.c
index 6a94e8558b2c..e1a2169ab9e9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -2265,7 +2265,7 @@ static int migrate_vma_collect_pmd(pmd_t *pmdp,
 	if (unlikely(pmd_bad(*pmdp)))
 		return migrate_vma_collect_skip(start, end, walk);
 
-	if (!pte_try_get(mm, pmdp))
+	if (!pte_try_get(pmdp))
 		goto again;
 	ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
 	arch_enter_lazy_mmu_mode();
diff --git a/mm/mincore.c b/mm/mincore.c
index e21e271a7657..76eb31aaeef9 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -115,7 +115,7 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	}
 
 	if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) && pmd_trans_unstable(pmd)) ||
-	    !pte_try_get(walk->mm, pmd)) {
+	    !pte_try_get(pmd)) {
 		__mincore_unmapped_range(addr, end, vma, vec);
 		goto out;
 	}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 9cbd0848c5c5..8b387f8386c4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -319,7 +319,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
 			}
 			/* fall through, the trans huge pmd just split */
 		}
-		if (!pte_try_get(vma->vm_mm, pmd))
+		if (!pte_try_get(pmd))
 			goto retry;
 		this_pages = change_pte_range(vma, pmd, addr, next, newprot,
 					      cp_flags);
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index eb84fa5825c0..c49bbff7aa60 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -259,7 +259,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw)
 			step_forward(pvmw, PMD_SIZE);
 			continue;
 		}
-		if (!pte_try_get(pvmw->vma->vm_mm, pvmw->pmd))
+		if (!pte_try_get(pvmw->pmd))
 			goto retry;
 		if (!map_pte(pvmw))
 			goto next_pte;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 4080a88d7852..c7439a2e85f7 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -152,7 +152,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 			err = walk_hugepd_range((hugepd_t *)pmd, addr, next, walk, PMD_SHIFT);
 		} else {
 			if (!walk->no_vma) {
-				if (!pte_try_get(walk->mm, pmd))
+				if (!pte_try_get(pmd))
 					goto again;
 				err = walk_pte_range(pmd, addr, next, walk);
 				pte_put(walk->mm, pmd, addr);
diff --git a/mm/pte_ref.c b/mm/pte_ref.c
index 1b8d9828d513..7fd3d687a9cd 100644
--- a/mm/pte_ref.c
+++ b/mm/pte_ref.c
@@ -26,6 +26,14 @@ static inline void pte_free_debug(pmd_t pmd)
 }
 #endif
 
+static void pte_free_rcu(struct rcu_head *rcu)
+{
+	struct page *page = container_of(rcu, struct page, rcu_head);
+
+	pgtable_pte_page_dtor(page);
+	__free_page(page);
+}
+
 void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
 {
 	struct vm_area_struct vma = TLB_FLUSH_VMA(mm, 0);
@@ -39,7 +47,7 @@ void free_pte_table(struct mm_struct *mm, pmd_t *pmdp, unsigned long addr)
 	pte_free_debug(pmd);
 	flush_tlb_range(&vma, addr, addr + PMD_SIZE);
 	mm_dec_nr_ptes(mm);
-	pte_free(mm, pmd_pgtable(pmd));
+	call_rcu(&pmd_pgtable(pmd)->rcu_head, pte_free_rcu);
 }
 
 static inline void __pte_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 6153283be500..fe6f7c6d2849 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2024,7 +2024,7 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 		next = pmd_addr_end(addr, end);
 		if ((!IS_ENABLED(CONFIG_FREE_USER_PTE) &&
 		    pmd_none_or_trans_huge_or_clear_bad(pmd)) ||
-		    !pte_try_get(vma->vm_mm, pmd))
+		    !pte_try_get(pmd))
 			continue;
 		ret = unuse_pte_range(vma, pmd, addr, next, type,
 				      frontswap, fs_pages_to_unuse);
-- 
2.11.0
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help