--- v7
+++ v2
@@ -1,69 +1,429 @@
-The current maybe_mkwrite() is getting passed the pointer to the vma
-structure to fetch the vm_flags field.
-
-When dealing with the speculative page fault handler, it will be better to
-rely on the cached vm_flags value stored in the vm_fault structure.
-
-This patch introduce a __maybe_mkwrite() service which can be called by
-passing the value of the vm_flags field.
-
-There is no change functional changes expected for the other callers of
-maybe_mkwrite().
-
+From: Peter Zijlstra <peterz@infradead.org>
+
+Provide infrastructure to do a speculative fault (not holding
+mmap_sem).
+
+The not holding of mmap_sem means we can race against VMA
+change/removal and page-table destruction. We use the SRCU VMA freeing
+to keep the VMA around. We use the VMA seqcount to detect change
+(including unmapping / page-table deletion) and we use gup_fast() style
+page-table walking to deal with page-table races.
+
+Once we've obtained the page and are ready to update the PTE, we
+validate if the state we started the fault with is still valid, if
+not, we'll fail the fault with VM_FAULT_RETRY, otherwise we update the
+PTE and we're done.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+
+[Manage the newly introduced pte_spinlock() for speculative page
+ fault to fail if the VMA is touched behind our back]
+[Rename vma_is_dead() to vma_has_changed() and declare it here]
+[Call p4d_alloc() as it is safe since pgd is valid]
+[Call pud_alloc() as it is safe since p4d is valid]
+[Set fe.sequence in __handle_mm_fault()]
+[Abort speculative path when handle_userfault() has to be called]
+[Add additional VMA's flags checks in handle_speculative_fault()]
+[Clear FAULT_FLAG_ALLOW_RETRY in handle_speculative_fault()]
+[Don't set vmf->pte and vmf->ptl if pte_map_lock() failed]
+[Remove warning comment about waiting for !seq&1 since we don't want
+ to wait]
+[Remove warning about no huge page support, mention it explicitly]
+[Don't call do_fault() in the speculative path as __do_fault() calls
+ vma->vm_ops->fault() which may want to release mmap_sem]
+[Only vm_fault pointer argument for vma_has_changed()]
+[Fix check against huge page, calling pmd_trans_huge()]
+[Introduce __HAVE_ARCH_CALL_SPF to declare the SPF handler only when
+ architecture is supporting it]
+[Use READ_ONCE() when reading VMA's fields in the speculative path]
+[Explicitly check for __HAVE_ARCH_PTE_SPECIAL as we can't support the
+ processing done in vm_normal_page()]
+[Check that vma->anon_vma is already set when starting the speculative
+ path]
+[Check for memory policy as we can't support MPOL_INTERLEAVE case due to
+ the processing done in mpol_misplaced()]
+[Don't support VMA growing up or down]
+[Move check on vm_sequence just before calling handle_pte_fault()]
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
---
- include/linux/mm.h | 9 +++++++--
- mm/memory.c | 6 +++---
- 2 files changed, 10 insertions(+), 5 deletions(-)
-
+ include/linux/hugetlb_inline.h | 2 +-
+ include/linux/mm.h | 5 +
+ include/linux/pagemap.h | 4 +-
+ mm/internal.h | 14 +++
+ mm/memory.c | 237 ++++++++++++++++++++++++++++++++++++++++-
+ 5 files changed, 254 insertions(+), 8 deletions(-)
+
+diff --git a/include/linux/hugetlb_inline.h b/include/linux/hugetlb_inline.h
+index a4e7ca0f3585..6cfdfca4cc2a 100644
+--- a/include/linux/hugetlb_inline.h
++++ b/include/linux/hugetlb_inline.h
+@@ -7,7 +7,7 @@
+
+ static inline bool is_vm_hugetlb_page(struct vm_area_struct *vma)
+ {
+- return !!(vma->vm_flags & VM_HUGETLB);
++ return !!(READ_ONCE(vma->vm_flags) & VM_HUGETLB);
+ }
+
+ #else
diff --git a/include/linux/mm.h b/include/linux/mm.h
-index c034f478b73d..4976ea1a8c90 100644
+index 0f4ddd72b172..0fe0811d304f 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
-@@ -685,13 +685,18 @@ void free_compound_page(struct page *page);
- * pte_mkwrite. But get_user_pages can cause write faults for mappings
- * that do not have writing enabled, when used by access_process_vm.
- */
--static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
-+static inline pte_t __maybe_mkwrite(pte_t pte, unsigned long vma_flags)
- {
-- if (likely(vma->vm_flags & VM_WRITE))
-+ if (likely(vma_flags & VM_WRITE))
- pte = pte_mkwrite(pte);
- return pte;
+@@ -315,6 +315,7 @@ struct vm_fault {
+ gfp_t gfp_mask; /* gfp mask to be used for allocations */
+ pgoff_t pgoff; /* Logical page offset based on vma */
+ unsigned long address; /* Faulting virtual address */
++ unsigned int sequence;
+ pmd_t *pmd; /* Pointer to pmd entry matching
+ * the 'address' */
+ pud_t *pud; /* Pointer to pud entry matching
+@@ -1297,6 +1298,10 @@ int invalidate_inode_page(struct page *page);
+ #ifdef CONFIG_MMU
+ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+ unsigned int flags);
++#ifdef __HAVE_ARCH_CALL_SPF
++extern int handle_speculative_fault(struct mm_struct *mm,
++ unsigned long address, unsigned int flags);
++#endif /* __HAVE_ARCH_CALL_SPF */
+ extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long address, unsigned int fault_flags,
+ bool *unlocked);
+diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
+index 79b36f57c3ba..3a9735dfa6b6 100644
+--- a/include/linux/pagemap.h
++++ b/include/linux/pagemap.h
+@@ -443,8 +443,8 @@ static inline pgoff_t linear_page_index(struct vm_area_struct *vma,
+ pgoff_t pgoff;
+ if (unlikely(is_vm_hugetlb_page(vma)))
+ return linear_hugepage_index(vma, address);
+- pgoff = (address - vma->vm_start) >> PAGE_SHIFT;
+- pgoff += vma->vm_pgoff;
++ pgoff = (address - READ_ONCE(vma->vm_start)) >> PAGE_SHIFT;
++ pgoff += READ_ONCE(vma->vm_pgoff);
+ return pgoff;
}
-+static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
+diff --git a/mm/internal.h b/mm/internal.h
+index 736540f15936..9d6347e35747 100644
+--- a/mm/internal.h
++++ b/mm/internal.h
+@@ -45,6 +45,20 @@ extern struct srcu_struct vma_srcu;
+ extern struct vm_area_struct *find_vma_srcu(struct mm_struct *mm,
+ unsigned long addr);
+
++static inline bool vma_has_changed(struct vm_fault *vmf)
+{
-+ return __maybe_mkwrite(pte, vma->vm_flags);
++ int ret = RB_EMPTY_NODE(&vmf->vma->vm_rb);
++ unsigned seq = ACCESS_ONCE(vmf->vma->vm_sequence.sequence);
++
++ /*
++ * Matches both the wmb in write_seqlock_{begin,end}() and
++ * the wmb in vma_rb_erase().
++ */
++ smp_rmb();
++
++ return ret || seq != vmf->sequence;
+}
+
- int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
- struct page *page);
- int finish_fault(struct vm_fault *vmf);
+ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
+ unsigned long floor, unsigned long ceiling);
+
diff --git a/mm/memory.c b/mm/memory.c
-index 2e4c5755cbc1..eb84429be9ea 100644
+index 51bc8315281e..0ba14a5797b2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
-@@ -2452,7 +2452,7 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
-
- flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
- entry = pte_mkyoung(vmf->orig_pte);
-- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-+ entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
- if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
- update_mmu_cache(vma, vmf->address, vmf->pte);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
-@@ -2543,8 +2543,8 @@ static int wp_page_copy(struct vm_fault *vmf)
- inc_mm_counter_fast(mm, MM_ANONPAGES);
- }
- flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
-- entry = mk_pte(new_page, vma->vm_page_prot);
-- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-+ entry = mk_pte(new_page, vmf->vma_page_prot);
-+ entry = __maybe_mkwrite(pte_mkdirty(entry), vmf->vma_flags);
- /*
- * Clear the pte entry and flush it first, before updating the
- * pte with the new entry. This will avoid a race condition
+@@ -760,7 +760,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
+ if (page)
+ dump_page(page, "bad pte");
+ pr_alert("addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
+- (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
++ (void *)addr, READ_ONCE(vma->vm_flags), vma->anon_vma,
++ mapping, index);
+ /*
+ * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
+ */
+@@ -2285,15 +2286,69 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
+
+ static bool pte_spinlock(struct vm_fault *vmf)
+ {
++ bool ret = false;
++
++ /* Check if vma is still valid */
++ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
++ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
++ spin_lock(vmf->ptl);
++ return true;
++ }
++
++ local_irq_disable();
++ if (vma_has_changed(vmf))
++ goto out;
++
+ vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
+ spin_lock(vmf->ptl);
+- return true;
++
++ if (vma_has_changed(vmf)) {
++ spin_unlock(vmf->ptl);
++ goto out;
++ }
++
++ ret = true;
++out:
++ local_irq_enable();
++ return ret;
+ }
+
+ static bool pte_map_lock(struct vm_fault *vmf)
+ {
+- vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
+- return true;
++ bool ret = false;
++ pte_t *pte;
++ spinlock_t *ptl;
++
++ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE)) {
++ vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
++ vmf->address, &vmf->ptl);
++ return true;
++ }
++
++ /*
++ * The first vma_has_changed() guarantees the page-tables are still
++ * valid, having IRQs disabled ensures they stay around, hence the
++ * second vma_has_changed() to make sure they are still valid once
++ * we've got the lock. After that a concurrent zap_pte_range() will
++ * block on the PTL and thus we're safe.
++ */
++ local_irq_disable();
++ if (vma_has_changed(vmf))
++ goto out;
++
++ pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd,
++ vmf->address, &ptl);
++ if (vma_has_changed(vmf)) {
++ pte_unmap_unlock(pte, ptl);
++ goto out;
++ }
++
++ vmf->pte = pte;
++ vmf->ptl = ptl;
++ ret = true;
++out:
++ local_irq_enable();
++ return ret;
+ }
+
+ /*
+@@ -2939,6 +2994,14 @@ static int do_anonymous_page(struct vm_fault *vmf)
+ return VM_FAULT_RETRY;
+ if (!pte_none(*vmf->pte))
+ goto unlock;
++ /*
++ * Don't call the userfaultfd during the speculative path.
++ * We already checked for the VMA to not be managed through
++ * userfaultfd, but it may be set behind our back once we have locked
++ * the pte. In such a case we can ignore it this time.
++ */
++ if (vmf->flags & FAULT_FLAG_SPECULATIVE)
++ goto setpte;
+ /* Deliver the page fault to userland, check inside PT lock */
+ if (userfaultfd_missing(vma)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+@@ -2977,7 +3040,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
+ goto release;
+
+ /* Deliver the page fault to userland, check inside PT lock */
+- if (userfaultfd_missing(vma)) {
++ if (!(vmf->flags & FAULT_FLAG_SPECULATIVE) && userfaultfd_missing(vma)) {
+ pte_unmap_unlock(vmf->pte, vmf->ptl);
+ mem_cgroup_cancel_charge(page, memcg, false);
+ put_page(page);
+@@ -3748,6 +3811,8 @@ static int handle_pte_fault(struct vm_fault *vmf)
+ if (!vmf->pte) {
+ if (vma_is_anonymous(vmf->vma))
+ return do_anonymous_page(vmf);
++ else if (vmf->flags & FAULT_FLAG_SPECULATIVE)
++ return VM_FAULT_RETRY;
+ else
+ return do_fault(vmf);
+ }
+@@ -3845,6 +3910,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+ vmf.pmd = pmd_alloc(mm, vmf.pud, address);
+ if (!vmf.pmd)
+ return VM_FAULT_OOM;
++ vmf.sequence = raw_read_seqcount(&vma->vm_sequence);
+ if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) {
+ ret = create_huge_pmd(&vmf);
+ if (!(ret & VM_FAULT_FALLBACK))
+@@ -3872,6 +3938,167 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
+ return handle_pte_fault(&vmf);
+ }
+
++#ifdef __HAVE_ARCH_CALL_SPF
++
++#ifndef __HAVE_ARCH_PTE_SPECIAL
++/* This is required by vm_normal_page() */
++#error "Speculative page fault handler requires __HAVE_ARCH_PTE_SPECIAL"
++#endif
++
++/*
++ * vm_normal_page() adds some processing which should be done while
++ * holding the mmap_sem.
++ */
++int handle_speculative_fault(struct mm_struct *mm, unsigned long address,
++ unsigned int flags)
++{
++ struct vm_fault vmf = {
++ .address = address,
++ };
++ pgd_t *pgd;
++ p4d_t *p4d;
++ pud_t *pud;
++ pmd_t *pmd;
++ int dead, seq, idx, ret = VM_FAULT_RETRY;
++ struct vm_area_struct *vma;
++ struct mempolicy *pol;
++
++ /* Clear flags that may lead to release the mmap_sem to retry */
++ flags &= ~(FAULT_FLAG_ALLOW_RETRY|FAULT_FLAG_KILLABLE);
++ flags |= FAULT_FLAG_SPECULATIVE;
++
++ idx = srcu_read_lock(&vma_srcu);
++ vma = find_vma_srcu(mm, address);
++ if (!vma)
++ goto unlock;
++
++ /*
++ * Validate the VMA found by the lockless lookup.
++ */
++ dead = RB_EMPTY_NODE(&vma->vm_rb);
++ seq = raw_read_seqcount(&vma->vm_sequence); /* rmb <-> seqlock,vma_rb_erase() */
++ if ((seq & 1) || dead)
++ goto unlock;
++
++ /*
++ * Can't call vm_ops services as we don't know what they would do
++ * with the VMA.
++ * This includes huge pages from hugetlbfs.
++ */
++ if (vma->vm_ops)
++ goto unlock;
++
++ if (unlikely(!vma->anon_vma))
++ goto unlock;
++
++ vmf.vma_flags = READ_ONCE(vma->vm_flags);
++ vmf.vma_page_prot = READ_ONCE(vma->vm_page_prot);
++
++ /* Can't call userland page fault handler in the speculative path */
++ if (unlikely(vmf.vma_flags & VM_UFFD_MISSING))
++ goto unlock;
++
++ /*
++ * MPOL_INTERLEAVE implies additional check in mpol_misplaced() which
++ * are not compatible with the speculative page fault processing.
++ */
++ pol = __get_vma_policy(vma, address);
++ if (!pol)
++ pol = get_task_policy(current);
++ if (pol && pol->mode == MPOL_INTERLEAVE)
++ goto unlock;
++
++ if (vmf.vma_flags & VM_GROWSDOWN || vmf.vma_flags & VM_GROWSUP)
++ /*
++ * This could be detected by the check address against VMA's
++ * boundaries but we want to trace it as not supported instead
++ * of changed.
++ */
++ goto unlock;
++
++ if (address < READ_ONCE(vma->vm_start)
++ || READ_ONCE(vma->vm_end) <= address)
++ goto unlock;
++
++ /*
++ * The three following checks are copied from access_error from
++ * arch/x86/mm/fault.c
++ */
++ if (!arch_vma_access_permitted(vma, flags & FAULT_FLAG_WRITE,
++ flags & FAULT_FLAG_INSTRUCTION,
++ flags & FAULT_FLAG_REMOTE))
++ goto unlock;
++
++ /* This one is required to check that the VMA has write access set */
++ if (flags & FAULT_FLAG_WRITE) {
++ if (unlikely(!(vmf.vma_flags & VM_WRITE)))
++ goto unlock;
++ } else {
++ if (unlikely(!(vmf.vma_flags & (VM_READ | VM_EXEC | VM_WRITE))))
++ goto unlock;
++ }
++
++ /*
++ * Do a speculative lookup of the PTE entry.
++ */
++ local_irq_disable();
++ pgd = pgd_offset(mm, address);
++ if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
++ goto out_walk;
++
++ p4d = p4d_alloc(mm, pgd, address);
++ if (p4d_none(*p4d) || unlikely(p4d_bad(*p4d)))
++ goto out_walk;
++
++ pud = pud_alloc(mm, p4d, address);
++ if (pud_none(*pud) || unlikely(pud_bad(*pud)))
++ goto out_walk;
++
++ pmd = pmd_offset(pud, address);
++ if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd)))
++ goto out_walk;
++
++ /*
++ * The above does not allocate/instantiate page-tables because doing so
++ * would lead to the possibility of instantiating page-tables after
++ * free_pgtables() -- and consequently leaking them.
++ *
++ * The result is that we take at least one !speculative fault per PMD
++ * in order to instantiate it.
++ */
++
++ /* Transparent huge pages are not supported. */
++ if (unlikely(pmd_trans_huge(*pmd)))
++ goto out_walk;
++
++ vmf.vma = vma;
++ vmf.pmd = pmd;
++ vmf.pgoff = linear_page_index(vma, address);
++ vmf.gfp_mask = __get_fault_gfp_mask(vma);
++ vmf.sequence = seq;
++ vmf.flags = flags;
++
++ local_irq_enable();
++
++ /*
++ * We need to re-validate the VMA after checking the bounds, otherwise
++ * we might have a false positive on the bounds.
++ */
++ if (read_seqcount_retry(&vma->vm_sequence, seq))
++ goto unlock;
++
++ ret = handle_pte_fault(&vmf);
++
++unlock:
++ srcu_read_unlock(&vma_srcu, idx);
++ return ret;
++
++out_walk:
++ local_irq_enable();
++ goto unlock;
++}
++#endif /* __HAVE_ARCH_CALL_SPF */
++
+ /*
+ * By the time we get here, we already hold the mm semaphore
+ *
--
2.7.4