Thread (91 messages) 91 messages, 9 authors, 2022-09-29

Re: [RFC PATCH RESEND 28/28] kernel/fork: throttle call_rcu() calls in vm_area_free

From: Laurent Dufour <hidden>
Date: 2022-09-09 16:15:10
Also in: linux-arm-kernel, linux-mm, lkml

Le 09/09/2022 à 18:02, Suren Baghdasaryan a écrit :
On Fri, Sep 9, 2022 at 8:19 AM Laurent Dufour [off-list ref] wrote:
quoted
Le 01/09/2022 à 19:35, Suren Baghdasaryan a écrit :
quoted
call_rcu() can take a long time when callback offloading is enabled.
Its use in the vm_area_free can cause regressions in the exit path when
multiple VMAs are being freed. To minimize that impact, place VMAs into
a list and free them in groups using one call_rcu() call per group.

Signed-off-by: Suren Baghdasaryan <surenb@google.com>
---
 include/linux/mm.h       |  1 +
 include/linux/mm_types.h | 11 ++++++-
 kernel/fork.c            | 68 +++++++++++++++++++++++++++++++++++-----
 mm/init-mm.c             |  3 ++
 mm/mmap.c                |  1 +
 5 files changed, 75 insertions(+), 9 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a3cbaa7b9119..81dff694ac14 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -249,6 +249,7 @@ void setup_initial_init_mm(void *start_code, void *end_code,
 struct vm_area_struct *vm_area_alloc(struct mm_struct *);
 struct vm_area_struct *vm_area_dup(struct vm_area_struct *);
 void vm_area_free(struct vm_area_struct *);
+void drain_free_vmas(struct mm_struct *mm);

 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 36562e702baf..6f3effc493b1 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -412,7 +412,11 @@ struct vm_area_struct {
                      struct vm_area_struct *vm_next, *vm_prev;
              };
 #ifdef CONFIG_PER_VMA_LOCK
-             struct rcu_head vm_rcu; /* Used for deferred freeing. */
+             struct {
+                     struct list_head vm_free_list;
+                     /* Used for deferred freeing. */
+                     struct rcu_head vm_rcu;
+             };
 #endif
      };
@@ -573,6 +577,11 @@ struct mm_struct {
                                        */
 #ifdef CONFIG_PER_VMA_LOCK
              int mm_lock_seq;
+             struct {
+                     struct list_head head;
+                     spinlock_t lock;
+                     int size;
+             } vma_free_list;
 #endif

diff --git a/kernel/fork.c b/kernel/fork.c
index b443ba3a247a..7c88710aed72 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -483,26 +483,75 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
 }

 #ifdef CONFIG_PER_VMA_LOCK
-static void __vm_area_free(struct rcu_head *head)
+static inline void __vm_area_free(struct vm_area_struct *vma)
 {
-     struct vm_area_struct *vma = container_of(head, struct vm_area_struct,
-                                               vm_rcu);
      /* The vma should either have no lock holders or be write-locked. */
      vma_assert_no_reader(vma);
      kmem_cache_free(vm_area_cachep, vma);
 }
-#endif
+
+static void vma_free_rcu_callback(struct rcu_head *head)
+{
+     struct vm_area_struct *first_vma;
+     struct vm_area_struct *vma, *vma2;
+
+     first_vma = container_of(head, struct vm_area_struct, vm_rcu);
+     list_for_each_entry_safe(vma, vma2, &first_vma->vm_free_list, vm_free_list)
Is that safe to walk the list against concurrent calls to
list_splice_init(), or list_add()?
I think it is. drain_free_vmas() moves the to-be-destroyed and already
isolated VMAs from mm->vma_free_list into to_destroy list and then
passes that list to vma_free_rcu_callback(). At this point the list of
VMAs passed to vma_free_rcu_callback() is not accessible either from
mm (VMAs were isolated before vm_area_free() was called) or from
drain_free_vmas() since they were already removed from
mm->vma_free_list. Does that make sense?
Got it!
Thanks for the explanation.
quoted
quoted
+             __vm_area_free(vma);
+     __vm_area_free(first_vma);
+}
+
+void drain_free_vmas(struct mm_struct *mm)
+{
+     struct vm_area_struct *first_vma;
+     LIST_HEAD(to_destroy);
+
+     spin_lock(&mm->vma_free_list.lock);
+     list_splice_init(&mm->vma_free_list.head, &to_destroy);
+     mm->vma_free_list.size = 0;
+     spin_unlock(&mm->vma_free_list.lock);
+
+     if (list_empty(&to_destroy))
+             return;
+
+     first_vma = list_first_entry(&to_destroy, struct vm_area_struct, vm_free_list);
+     /* Remove the head which is allocated on the stack */
+     list_del(&to_destroy);
+
+     call_rcu(&first_vma->vm_rcu, vma_free_rcu_callback);
+}
+
+#define VM_AREA_FREE_LIST_MAX        32
+
+void vm_area_free(struct vm_area_struct *vma)
+{
+     struct mm_struct *mm = vma->vm_mm;
+     bool drain;
+
+     free_anon_vma_name(vma);
+
+     spin_lock(&mm->vma_free_list.lock);
+     list_add(&vma->vm_free_list, &mm->vma_free_list.head);
+     mm->vma_free_list.size++;
+     drain = mm->vma_free_list.size > VM_AREA_FREE_LIST_MAX;
+     spin_unlock(&mm->vma_free_list.lock);
+
+     if (drain)
+             drain_free_vmas(mm);
+}
+
+#else /* CONFIG_PER_VMA_LOCK */
+
+void drain_free_vmas(struct mm_struct *mm) {}

 void vm_area_free(struct vm_area_struct *vma)
 {
      free_anon_vma_name(vma);
-#ifdef CONFIG_PER_VMA_LOCK
-     call_rcu(&vma->vm_rcu, __vm_area_free);
-#else
      kmem_cache_free(vm_area_cachep, vma);
-#endif
 }

+#endif /* CONFIG_PER_VMA_LOCK */
+
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
      if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@@ -1137,6 +1186,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
      INIT_LIST_HEAD(&mm->mmlist);
 #ifdef CONFIG_PER_VMA_LOCK
      WRITE_ONCE(mm->mm_lock_seq, 0);
+     INIT_LIST_HEAD(&mm->vma_free_list.head);
+     spin_lock_init(&mm->vma_free_list.lock);
+     mm->vma_free_list.size = 0;
 #endif
      mm_pgtables_bytes_init(mm);
      mm->map_count = 0;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 8399f90d631c..7b6d2460545f 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -39,6 +39,9 @@ struct mm_struct init_mm = {
      .mmlist         = LIST_HEAD_INIT(init_mm.mmlist),
 #ifdef CONFIG_PER_VMA_LOCK
      .mm_lock_seq    = 0,
+     .vma_free_list.head = LIST_HEAD_INIT(init_mm.vma_free_list.head),
+     .vma_free_list.lock =  __SPIN_LOCK_UNLOCKED(init_mm.vma_free_list.lock),
+     .vma_free_list.size = 0,
 #endif
      .user_ns        = &init_user_ns,
      .cpu_bitmap     = CPU_BITS_NONE,
diff --git a/mm/mmap.c b/mm/mmap.c
index 1edfcd384f5e..d61b7ef84ba6 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -3149,6 +3149,7 @@ void exit_mmap(struct mm_struct *mm)
      }
      mm->mmap = NULL;
      mmap_write_unlock(mm);
+     drain_free_vmas(mm);
      vm_unacct_memory(nr_accounted);
 }
  
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help