--- v10
+++ v2
@@ -1,376 +1,27 @@
-This change is inspired by Peter's proposed patch [1], which protected
-the VMA using SRCU. Unfortunately, SRCU does not scale well in that
-particular case, and it introduces major performance degradation due to
-excessive scheduling operations.
+Add a new software event to count successful speculative page faults.
-To allow access to the mm_rb tree without grabbing the mmap_sem, this patch
-protects access to it using a rwlock. As a lookup in the mm_rb tree is an
-O(log n) search, it is safe to protect it using such a lock. The VMA cache
-is not protected by the new rwlock and it should not be used without
-holding the mmap_sem.
-
-To allow the picked VMA structure to be used once the rwlock is released, a
-use count is added to the VMA structure. When the VMA is allocated, the use
-count is set to 1. Each time the VMA is picked with the rwlock held, its
-use count is incremented. Each time the VMA is released, it is decremented.
-When the use count hits zero, the VMA is no longer used and should be
-freed.
-
-This patch prepares for 2 kinds of VMA access:
- - as usual, under the control of the mmap_sem,
- - without holding the mmap_sem for the speculative page fault handler.
-
-Accesses done under the control of the mmap_sem do not need to grab the
-rwlock to protect read access to the mm_rb tree, but write accesses must
-be done under the protection of the rwlock too. This affects the insertion
-and removal of elements in the RB tree.
-
-The patch introduces 2 new functions:
- - get_vma() to find a VMA based on an address by holding the new rwlock.
- - put_vma() to release the VMA when it is no longer used.
-These services are designed to be used when accesses are made to the RB tree
-without holding the mmap_sem.
-
-When a VMA is removed from the RB tree, its vma->vm_rb field is cleared and
-we rely on the WMB done when releasing the rwlock to serialize the write
-with the RMB done in a later patch to check for the VMA's validity.
-
-When free_vma is called, the file associated with the VMA is closed
-immediately, but the policy and the file structure remain in use until
-the VMA's use count reaches 0, which may happen later when exiting an
-in-progress speculative page fault.
-
-[1] https://patchwork.kernel.org/patch/5108281/
-
-Cc: Peter Zijlstra (Intel) <peterz@infradead.org>
-Cc: Matthew Wilcox <willy@infradead.org>
Signed-off-by: Laurent Dufour <ldufour@linux.vnet.ibm.com>
---
- include/linux/mm.h | 1 +
- include/linux/mm_types.h | 4 ++
- kernel/fork.c | 3 ++
- mm/init-mm.c | 3 ++
- mm/internal.h | 6 +++
- mm/mmap.c | 115 +++++++++++++++++++++++++++++++++++------------
- 6 files changed, 104 insertions(+), 28 deletions(-)
+ include/uapi/linux/perf_event.h | 1 +
+ 1 file changed, 1 insertion(+)
-diff --git a/include/linux/mm.h b/include/linux/mm.h
-index f967bf84094f..e2c24ea58d94 100644
---- a/include/linux/mm.h
-+++ b/include/linux/mm.h
-@@ -1272,6 +1272,7 @@ static inline void INIT_VMA(struct vm_area_struct *vma)
- INIT_LIST_HEAD(&vma->anon_vma_chain);
- #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
- seqcount_init(&vma->vm_sequence);
-+ atomic_set(&vma->vm_ref_count, 1);
- #endif
- }
+diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
+index b1c0b187acfe..3043ec0988e9 100644
+--- a/include/uapi/linux/perf_event.h
++++ b/include/uapi/linux/perf_event.h
+@@ -111,6 +111,7 @@ enum perf_sw_ids {
+ PERF_COUNT_SW_EMULATION_FAULTS = 8,
+ PERF_COUNT_SW_DUMMY = 9,
+ PERF_COUNT_SW_BPF_OUTPUT = 10,
++ PERF_COUNT_SW_SPF_DONE = 11,
-diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
-index db5e9d630e7a..faf3844dd815 100644
---- a/include/linux/mm_types.h
-+++ b/include/linux/mm_types.h
-@@ -337,6 +337,7 @@ struct vm_area_struct {
- struct vm_userfaultfd_ctx vm_userfaultfd_ctx;
- #ifdef CONFIG_SPECULATIVE_PAGE_FAULT
- seqcount_t vm_sequence;
-+ atomic_t vm_ref_count; /* see get_vma(), put_vma() */
- #endif
- } __randomize_layout;
-
-@@ -355,6 +356,9 @@ struct kioctx_table;
- struct mm_struct {
- struct vm_area_struct *mmap; /* list of VMAs */
- struct rb_root mm_rb;
-+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-+ rwlock_t mm_rb_lock;
-+#endif
- u32 vmacache_seqnum; /* per-thread vmacache */
- #ifdef CONFIG_MMU
- unsigned long (*get_unmapped_area) (struct file *filp,
-diff --git a/kernel/fork.c b/kernel/fork.c
-index d937e5945f77..9f8d235a3df8 100644
---- a/kernel/fork.c
-+++ b/kernel/fork.c
-@@ -891,6 +891,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
- mm->mmap = NULL;
- mm->mm_rb = RB_ROOT;
- mm->vmacache_seqnum = 0;
-+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-+ rwlock_init(&mm->mm_rb_lock);
-+#endif
- atomic_set(&mm->mm_users, 1);
- atomic_set(&mm->mm_count, 1);
- init_rwsem(&mm->mmap_sem);
-diff --git a/mm/init-mm.c b/mm/init-mm.c
-index f94d5d15ebc0..e71ac37a98c4 100644
---- a/mm/init-mm.c
-+++ b/mm/init-mm.c
-@@ -17,6 +17,9 @@
-
- struct mm_struct init_mm = {
- .mm_rb = RB_ROOT,
-+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-+ .mm_rb_lock = __RW_LOCK_UNLOCKED(init_mm.mm_rb_lock),
-+#endif
- .pgd = swapper_pg_dir,
- .mm_users = ATOMIC_INIT(2),
- .mm_count = ATOMIC_INIT(1),
-diff --git a/mm/internal.h b/mm/internal.h
-index 62d8c34e63d5..fb2667b20f0a 100644
---- a/mm/internal.h
-+++ b/mm/internal.h
-@@ -40,6 +40,12 @@ void page_writeback_init(void);
-
- int do_swap_page(struct vm_fault *vmf);
-
-+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-+extern struct vm_area_struct *get_vma(struct mm_struct *mm,
-+ unsigned long addr);
-+extern void put_vma(struct vm_area_struct *vma);
-+#endif
-+
- void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma,
- unsigned long floor, unsigned long ceiling);
-
-diff --git a/mm/mmap.c b/mm/mmap.c
-index 5601f1ef8bb9..a82950960f2e 100644
---- a/mm/mmap.c
-+++ b/mm/mmap.c
-@@ -160,6 +160,27 @@ void unlink_file_vma(struct vm_area_struct *vma)
- }
- }
-
-+static void __free_vma(struct vm_area_struct *vma)
-+{
-+ if (vma->vm_file)
-+ fput(vma->vm_file);
-+ mpol_put(vma_policy(vma));
-+ kmem_cache_free(vm_area_cachep, vma);
-+}
-+
-+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-+void put_vma(struct vm_area_struct *vma)
-+{
-+ if (atomic_dec_and_test(&vma->vm_ref_count))
-+ __free_vma(vma);
-+}
-+#else
-+static inline void put_vma(struct vm_area_struct *vma)
-+{
-+ return __free_vma(vma);
-+}
-+#endif
-+
- /*
- * Close a vm structure and free it, returning the next.
- */
-@@ -170,10 +191,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
- might_sleep();
- if (vma->vm_ops && vma->vm_ops->close)
- vma->vm_ops->close(vma);
-- if (vma->vm_file)
-- fput(vma->vm_file);
-- mpol_put(vma_policy(vma));
-- kmem_cache_free(vm_area_cachep, vma);
-+ put_vma(vma);
- return next;
- }
-
-@@ -393,6 +411,14 @@ static void validate_mm(struct mm_struct *mm)
- #define validate_mm(mm) do { } while (0)
- #endif
-
-+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-+#define mm_rb_write_lock(mm) write_lock(&(mm)->mm_rb_lock)
-+#define mm_rb_write_unlock(mm) write_unlock(&(mm)->mm_rb_lock)
-+#else
-+#define mm_rb_write_lock(mm) do { } while (0)
-+#define mm_rb_write_unlock(mm) do { } while (0)
-+#endif /* CONFIG_SPECULATIVE_PAGE_FAULT */
-+
- RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb,
- unsigned long, rb_subtree_gap, vma_compute_subtree_gap)
-
-@@ -411,26 +437,37 @@ static void vma_gap_update(struct vm_area_struct *vma)
- }
-
- static inline void vma_rb_insert(struct vm_area_struct *vma,
-- struct rb_root *root)
-+ struct mm_struct *mm)
- {
-+ struct rb_root *root = &mm->mm_rb;
-+
- /* All rb_subtree_gap values must be consistent prior to insertion */
- validate_mm_rb(root, NULL);
-
- rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
- }
-
--static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root)
-+static void __vma_rb_erase(struct vm_area_struct *vma, struct mm_struct *mm)
- {
-+ struct rb_root *root = &mm->mm_rb;
- /*
- * Note rb_erase_augmented is a fairly large inline function,
- * so make sure we instantiate it only once with our desired
- * augmented rbtree callbacks.
- */
-+ mm_rb_write_lock(mm);
- rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks);
-+ mm_rb_write_unlock(mm); /* wmb */
-+
-+ /*
-+ * Ensure the removal is complete before clearing the node.
-+ * Matched by vma_has_changed()/handle_speculative_fault().
-+ */
-+ RB_CLEAR_NODE(&vma->vm_rb);
- }
-
- static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
-- struct rb_root *root,
-+ struct mm_struct *mm,
- struct vm_area_struct *ignore)
- {
- /*
-@@ -438,21 +475,21 @@ static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma,
- * with the possible exception of the "next" vma being erased if
- * next->vm_start was reduced.
- */
-- validate_mm_rb(root, ignore);
-+ validate_mm_rb(&mm->mm_rb, ignore);
-
-- __vma_rb_erase(vma, root);
-+ __vma_rb_erase(vma, mm);
- }
-
- static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
-- struct rb_root *root)
-+ struct mm_struct *mm)
- {
- /*
- * All rb_subtree_gap values must be consistent prior to erase,
- * with the possible exception of the vma being erased.
- */
-- validate_mm_rb(root, vma);
-+ validate_mm_rb(&mm->mm_rb, vma);
-
-- __vma_rb_erase(vma, root);
-+ __vma_rb_erase(vma, mm);
- }
-
- /*
-@@ -567,10 +604,12 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
- * immediately update the gap to the correct value. Finally we
- * rebalance the rbtree after all augmented values have been set.
- */
-+ mm_rb_write_lock(mm);
- rb_link_node(&vma->vm_rb, rb_parent, rb_link);
- vma->rb_subtree_gap = 0;
- vma_gap_update(vma);
-- vma_rb_insert(vma, &mm->mm_rb);
-+ vma_rb_insert(vma, mm);
-+ mm_rb_write_unlock(mm);
- }
-
- static void __vma_link_file(struct vm_area_struct *vma)
-@@ -646,7 +685,7 @@ static __always_inline void __vma_unlink_common(struct mm_struct *mm,
- {
- struct vm_area_struct *next;
-
-- vma_rb_erase_ignore(vma, &mm->mm_rb, ignore);
-+ vma_rb_erase_ignore(vma, mm, ignore);
- next = vma->vm_next;
- if (has_prev)
- prev->vm_next = next;
-@@ -923,16 +962,13 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start,
- }
-
- if (remove_next) {
-- if (file) {
-+ if (file)
- uprobe_munmap(next, next->vm_start, next->vm_end);
-- fput(file);
-- }
- if (next->anon_vma)
- anon_vma_merge(vma, next);
- mm->map_count--;
-- mpol_put(vma_policy(next));
- vm_raw_write_end(next);
-- kmem_cache_free(vm_area_cachep, next);
-+ put_vma(next);
- /*
- * In mprotect's case 6 (see comments on vma_merge),
- * we must remove another next too. It would clutter
-@@ -2190,15 +2226,11 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
- EXPORT_SYMBOL(get_unmapped_area);
-
- /* Look up the first VMA which satisfies addr < vm_end, NULL if none. */
--struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
-+static struct vm_area_struct *__find_vma(struct mm_struct *mm,
-+ unsigned long addr)
- {
- struct rb_node *rb_node;
-- struct vm_area_struct *vma;
--
-- /* Check the cache first. */
-- vma = vmacache_find(mm, addr);
-- if (likely(vma))
-- return vma;
-+ struct vm_area_struct *vma = NULL;
-
- rb_node = mm->mm_rb.rb_node;
-
-@@ -2216,13 +2248,40 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
- rb_node = rb_node->rb_right;
- }
-
-+ return vma;
-+}
-+
-+struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
-+{
-+ struct vm_area_struct *vma;
-+
-+ /* Check the cache first. */
-+ vma = vmacache_find(mm, addr);
-+ if (likely(vma))
-+ return vma;
-+
-+ vma = __find_vma(mm, addr);
- if (vma)
- vmacache_update(addr, vma);
- return vma;
- }
--
- EXPORT_SYMBOL(find_vma);
-
-+#ifdef CONFIG_SPECULATIVE_PAGE_FAULT
-+struct vm_area_struct *get_vma(struct mm_struct *mm, unsigned long addr)
-+{
-+ struct vm_area_struct *vma = NULL;
-+
-+ read_lock(&mm->mm_rb_lock);
-+ vma = __find_vma(mm, addr);
-+ if (vma)
-+ atomic_inc(&vma->vm_ref_count);
-+ read_unlock(&mm->mm_rb_lock);
-+
-+ return vma;
-+}
-+#endif
-+
- /*
- * Same as find_vma, but also return a pointer to the previous VMA in *pprev.
- */
-@@ -2590,7 +2649,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
- insertion_point = (prev ? &prev->vm_next : &mm->mmap);
- vma->vm_prev = NULL;
- do {
-- vma_rb_erase(vma, &mm->mm_rb);
-+ vma_rb_erase(vma, mm);
- mm->map_count--;
- tail_vma = vma;
- vma = vma->vm_next;
+ PERF_COUNT_SW_MAX, /* non-ABI */
+ };
--
2.7.4
+
+--
+To unsubscribe, send a message with 'unsubscribe linux-mm' in
+the body to majordomo@kvack.org. For more info on Linux MM,
+see: http://www.linux-mm.org/ .
+Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>