Re: [PATCH 1/2] mm/mmu_notifier: avoid double notification when it is useless v2

From: Balbir Singh <bsingharora@gmail.com>
Date: 2017-10-19 03:04:41
Also in: linux-iommu, linux-mm, linux-next, lkml

On Mon, 16 Oct 2017 23:10:02 -0400
jglisse@redhat.com wrote:

quoted hunk ↗ jump to hunk

From: J=C3=A9r=C3=B4me Glisse <redacted>
=20
+		/*
+		 * No need to call mmu_notifier_invalidate_range() as we are
+		 * downgrading page table protection not changing it to point
+		 * to a new page.
+		 *
+		 * See Documentation/vm/mmu_notifier.txt
+		 */
 		if (pmdp) {
 #ifdef CONFIG_FS_DAX_PMD
 			pmd_t pmd;

@@ -628,7 +635,6 @@ static void dax_mapping_entry_mkclean(struct address_=

space *mapping,

 			pmd =3D pmd_wrprotect(pmd);
 			pmd =3D pmd_mkclean(pmd);
 			set_pmd_at(vma->vm_mm, address, pmdp, pmd);
-			mmu_notifier_invalidate_range(vma->vm_mm, start, end);

Could the secondary TLB still see the mapping as dirty and propagate the di=
rty bit back?

quoted hunk ↗ jump to hunk

 unlock_pmd:
 			spin_unlock(ptl);
 #endif

@@ -643,7 +649,6 @@ static void dax_mapping_entry_mkclean(struct address_=

space *mapping,

 			pte =3D pte_wrprotect(pte);
 			pte =3D pte_mkclean(pte);
 			set_pte_at(vma->vm_mm, address, ptep, pte);
-			mmu_notifier_invalidate_range(vma->vm_mm, start, end);

Ditto

quoted hunk ↗ jump to hunk

 unlock_pte:
 			pte_unmap_unlock(ptep, ptl);
 		}

diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h
index 6866e8126982..49c925c96b8a 100644
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h

@@ -155,7 +155,8 @@ struct mmu_notifier_ops {
 	 * shared page-tables, it not necessary to implement the
 	 * invalidate_range_start()/end() notifiers, as
 	 * invalidate_range() alread catches the points in time when an
-	 * external TLB range needs to be flushed.
+	 * external TLB range needs to be flushed. For more in depth
+	 * discussion on this see Documentation/vm/mmu_notifier.txt
 	 *
 	 * The invalidate_range() function is called under the ptl
 	 * spin-lock and not allowed to sleep.

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c037d3d34950..ff5bc647b51d 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c

@@ -1186,8 +1186,15 @@ static int do_huge_pmd_wp_page_fallback(struct vm_=

fault *vmf, pmd_t orig_pmd,

quoted hunk ↗ jump to hunk

 		goto out_free_pages;
 	VM_BUG_ON_PAGE(!PageHead(page), page);
=20
+	/*
+	 * Leave pmd empty until pte is filled note we must notify here as
+	 * concurrent CPU thread might write to new page before the call to
+	 * mmu_notifier_invalidate_range_end() happens which can lead to a
+	 * device seeing memory write in different order than CPU.
+	 *
+	 * See Documentation/vm/mmu_notifier.txt
+	 */
 	pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-	/* leave pmd empty until pte is filled */
=20
 	pgtable =3D pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
 	pmd_populate(vma->vm_mm, &_pmd, pgtable);

@@ -2026,8 +2033,15 @@ static void __split_huge_zero_page_pmd(struct vm_a=

rea_struct *vma,

 	pmd_t _pmd;
 	int i;
=20
-	/* leave pmd empty until pte is filled */
-	pmdp_huge_clear_flush_notify(vma, haddr, pmd);
+	/*
+	 * Leave pmd empty until pte is filled note that it is fine to delay
+	 * notification until mmu_notifier_invalidate_range_end() as we are
+	 * replacing a zero pmd write protected page with a zero pte write
+	 * protected page.
+	 *
+	 * See Documentation/vm/mmu_notifier.txt
+	 */
+	pmdp_huge_clear_flush(vma, haddr, pmd);

Shouldn't the secondary TLB know if the page size changed?

quoted hunk ↗ jump to hunk

=20
 	pgtable =3D pgtable_trans_huge_withdraw(mm, pmd);
 	pmd_populate(mm, &_pmd, pgtable);

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 1768efa4c501..63a63f1b536c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c

@@ -3254,9 +3254,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst,=

 struct mm_struct *src,

 			set_huge_swap_pte_at(dst, addr, dst_pte, entry, sz);
 		} else {
 			if (cow) {
+				/*
+				 * No need to notify as we are downgrading page
+				 * table protection not changing it to point
+				 * to a new page.
+				 *
+				 * See Documentation/vm/mmu_notifier.txt
+				 */
 				huge_ptep_set_wrprotect(src, addr, src_pte);

OK.. so we could get write faults on write accesses from the device.

quoted hunk ↗ jump to hunk

-				mmu_notifier_invalidate_range(src, mmun_start,
-								   mmun_end);
 			}
 			entry =3D huge_ptep_get(src_pte);
 			ptepage =3D pte_page(entry);

@@ -4288,7 +4293,12 @@ unsigned long hugetlb_change_protection(struct vm_=

area_struct *vma,

quoted hunk ↗ jump to hunk

 	 * and that page table be reused and filled with junk.
 	 */
 	flush_hugetlb_tlb_range(vma, start, end);
-	mmu_notifier_invalidate_range(mm, start, end);
+	/*
+	 * No need to call mmu_notifier_invalidate_range() we are downgrading
+	 * page table protection not changing it to point to a new page.
+	 *
+	 * See Documentation/vm/mmu_notifier.txt
+	 */
 	i_mmap_unlock_write(vma->vm_file->f_mapping);
 	mmu_notifier_invalidate_range_end(mm, start, end);
=20

diff --git a/mm/ksm.c b/mm/ksm.c
index 6cb60f46cce5..be8f4576f842 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c

@@ -1052,8 +1052,13 @@ static int write_protect_page(struct vm_area_struc=

t *vma, struct page *page,

quoted hunk ↗ jump to hunk

 		 * So we clear the pte and flush the tlb before the check
 		 * this assure us that no O_DIRECT can happen after the check
 		 * or in the middle of the check.
+		 *
+		 * No need to notify as we are downgrading page table to read
+		 * only not changing it to point to a new page.
+		 *
+		 * See Documentation/vm/mmu_notifier.txt
 		 */
-		entry =3D ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte);
+		entry =3D ptep_clear_flush(vma, pvmw.address, pvmw.pte);
 		/*
 		 * Check that no O_DIRECT or similar I/O is in progress on the
 		 * page

@@ -1136,7 +1141,13 @@ static int replace_page(struct vm_area_struct *vma=

, struct page *page,

quoted hunk ↗ jump to hunk

 	}
=20
 	flush_cache_page(vma, addr, pte_pfn(*ptep));
-	ptep_clear_flush_notify(vma, addr, ptep);
+	/*
+	 * No need to notify as we are replacing a read only page with another
+	 * read only page with the same content.
+	 *
+	 * See Documentation/vm/mmu_notifier.txt
+	 */
+	ptep_clear_flush(vma, addr, ptep);
 	set_pte_at_notify(mm, addr, ptep, newpte);
=20
 	page_remove_rmap(page, false);

diff --git a/mm/rmap.c b/mm/rmap.c
index 061826278520..6b5a0f219ac0 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c

@@ -937,10 +937,15 @@ static bool page_mkclean_one(struct page *page, str=

uct vm_area_struct *vma,

quoted hunk ↗ jump to hunk

 #endif
 		}
=20
-		if (ret) {
-			mmu_notifier_invalidate_range(vma->vm_mm, cstart, cend);
+		/*
+		 * No need to call mmu_notifier_invalidate_range() as we are
+		 * downgrading page table protection not changing it to point
+		 * to a new page.
+		 *
+		 * See Documentation/vm/mmu_notifier.txt
+		 */
+		if (ret)
 			(*cleaned)++;
-		}
 	}
=20
 	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);

@@ -1424,6 +1429,10 @@ static bool try_to_unmap_one(struct page *page, st=

ruct vm_area_struct *vma,

quoted hunk ↗ jump to hunk

 			if (pte_soft_dirty(pteval))
 				swp_pte =3D pte_swp_mksoft_dirty(swp_pte);
 			set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
+			/*
+			 * No need to invalidate here it will synchronize on
+			 * against the special swap migration pte.
+			 */
 			goto discard;
 		}
=20

@@ -1481,6 +1490,9 @@ static bool try_to_unmap_one(struct page *page, str=

uct vm_area_struct *vma,

quoted hunk ↗ jump to hunk

 			 * will take care of the rest.
 			 */
 			dec_mm_counter(mm, mm_counter(page));
+			/* We have to invalidate as we cleared the pte */
+			mmu_notifier_invalidate_range(mm, address,
+						      address + PAGE_SIZE);
 		} else if (IS_ENABLED(CONFIG_MIGRATION) &&
 				(flags & (TTU_MIGRATION|TTU_SPLIT_FREEZE))) {
 			swp_entry_t entry;

@@ -1496,6 +1508,10 @@ static bool try_to_unmap_one(struct page *page, st=

ruct vm_area_struct *vma,

quoted hunk ↗ jump to hunk

 			if (pte_soft_dirty(pteval))
 				swp_pte =3D pte_swp_mksoft_dirty(swp_pte);
 			set_pte_at(mm, address, pvmw.pte, swp_pte);
+			/*
+			 * No need to invalidate here it will synchronize on
+			 * against the special swap migration pte.
+			 */
 		} else if (PageAnon(page)) {
 			swp_entry_t entry =3D { .val =3D page_private(subpage) };
 			pte_t swp_pte;

@@ -1507,6 +1523,8 @@ static bool try_to_unmap_one(struct page *page, str=

uct vm_area_struct *vma,

quoted hunk ↗ jump to hunk

 				WARN_ON_ONCE(1);
 				ret =3D false;
 				/* We have to invalidate as we cleared the pte */
+				mmu_notifier_invalidate_range(mm, address,
+							address + PAGE_SIZE);
 				page_vma_mapped_walk_done(&pvmw);
 				break;
 			}

@@ -1514,6 +1532,9 @@ static bool try_to_unmap_one(struct page *page, str=

uct vm_area_struct *vma,

quoted hunk ↗ jump to hunk

 			/* MADV_FREE page check */
 			if (!PageSwapBacked(page)) {
 				if (!PageDirty(page)) {
+					/* Invalidate as we cleared the pte */
+					mmu_notifier_invalidate_range(mm,
+						address, address + PAGE_SIZE);
 					dec_mm_counter(mm, MM_ANONPAGES);
 					goto discard;
 				}

@@ -1547,13 +1568,39 @@ static bool try_to_unmap_one(struct page *page, s=

truct vm_area_struct *vma,

 			if (pte_soft_dirty(pteval))
 				swp_pte =3D pte_swp_mksoft_dirty(swp_pte);
 			set_pte_at(mm, address, pvmw.pte, swp_pte);
-		} else
+			/* Invalidate as we cleared the pte */
+			mmu_notifier_invalidate_range(mm, address,
+						      address + PAGE_SIZE);
+		} else {
+			/*
+			 * We should not need to notify here as we reach this
+			 * case only from freeze_page() itself only call from
+			 * split_huge_page_to_list() so everything below must
+			 * be true:
+			 *   - page is not anonymous
+			 *   - page is locked
+			 *
+			 * So as it is a locked file back page thus it can not
+			 * be remove from the page cache and replace by a new
+			 * page before mmu_notifier_invalidate_range_end so no
+			 * concurrent thread might update its page table to
+			 * point at new page while a device still is using this
+			 * page.
+			 *
+			 * See Documentation/vm/mmu_notifier.txt
+			 */
 			dec_mm_counter(mm, mm_counter_file(page));
+		}
 discard:
+		/*
+		 * No need to call mmu_notifier_invalidate_range() it has be
+		 * done above for all cases requiring it to happen under page
+		 * table lock before mmu_notifier_invalidate_range_end()
+		 *
+		 * See Documentation/vm/mmu_notifier.txt
+		 */
 		page_remove_rmap(subpage, PageHuge(page));
 		put_page(page);
-		mmu_notifier_invalidate_range(mm, address,
-					      address + PAGE_SIZE);
 	}
=20
 	mmu_notifier_invalidate_range_end(vma->vm_mm, start, end);

Looking at the patchset, I understand the efficiency, but I am concerned
with correctness.

Balbir Singh.

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help