[PATCH v4 06/66] mm: Start tracking VMAs with maple tree
From: Liam Howlett <hidden>
Date: 2021-12-01 14:33:45
Also in:
lkml
Subsystem:
exec & binfmt api, elf, extensible firmware interface (efi), intel(r) trusted execution technology (txt), memory management, memory management - core, memory mapping, scheduler, the rest, tracing, x86 architecture (32-bit and 64-bit) · Maintainers:
Kees Cook, Ard Biesheuvel, Ning Sun, Andrew Morton, David Hildenbrand, Liam R. Howlett, Lorenzo Stoakes, Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot, Linus Torvalds, Steven Rostedt, Masami Hiramatsu, Thomas Gleixner, Borislav Petkov, Dave Hansen
From: "Liam R. Howlett" <redacted> Start tracking the VMAs with the new maple tree structure in parallel with the rb_tree. Add debug and trace events for maple tree operations and duplicate the rb_tree that is created on forks into the maple tree. The maple tree is added to the mm_struct including the mm_init struct, added support in required mm/mmap functions, added tracking in kernel/fork for process forking, and used to find the unmapped_area and checked against what the rbtree finds. Signed-off-by: Liam R. Howlett <redacted> Signed-off-by: Matthew Wilcox (Oracle) <willy@infradead.org> --- arch/x86/kernel/tboot.c | 1 + drivers/firmware/efi/efi.c | 1 + include/linux/mm.h | 2 + include/linux/mm_types.h | 3 + include/trace/events/mmap.h | 71 ++++++++++++ init/main.c | 2 + kernel/fork.c | 5 + mm/init-mm.c | 2 + mm/internal.h | 73 ++++++++++++ mm/mmap.c | 220 +++++++++++++++++++++++++++++++++++- 10 files changed, 378 insertions(+), 2 deletions(-)
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index f9af561c3cd4..859e8d2ea070 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c@@ -98,6 +98,7 @@ void __init tboot_probe(void) static pgd_t *tboot_pg_dir; static struct mm_struct tboot_mm = { .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1),
diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c
index ae79c3300129..0b40291416ca 100644
--- a/drivers/firmware/efi/efi.c
+++ b/drivers/firmware/efi/efi.c@@ -55,6 +55,7 @@ static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR; struct mm_struct efi_mm = { .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock), .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), .write_protect_seq = SEQCNT_ZERO(efi_mm.write_protect_seq),
diff --git a/include/linux/mm.h b/include/linux/mm.h
index a7e4a9e7d807..9eae78a155be 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h@@ -2610,6 +2610,8 @@ extern bool arch_has_descending_max_zone_pfns(void); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; extern int nommu_shrink_inode_mappings(struct inode *, size_t, size_t); +/* maple_tree */ +void vma_store(struct mm_struct *mm, struct vm_area_struct *vma); /* interval_tree.c */ void vma_interval_tree_insert(struct vm_area_struct *node,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index c3a6e6209600..d9ce412fca04 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h@@ -8,6 +8,7 @@ #include <linux/list.h> #include <linux/spinlock.h> #include <linux/rbtree.h> +#include <linux/maple_tree.h> #include <linux/rwsem.h> #include <linux/completion.h> #include <linux/cpumask.h>
@@ -467,6 +468,7 @@ struct kioctx_table; struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ + struct maple_tree mm_mt; struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU
@@ -655,6 +657,7 @@ struct mm_struct { unsigned long cpu_bitmap[]; }; +#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN) extern struct mm_struct init_mm; /* Pointer magic because the dynamic array size confuses some compilers. */
diff --git a/include/trace/events/mmap.h b/include/trace/events/mmap.h
index 4661f7ba07c0..f5c1626f3bbb 100644
--- a/include/trace/events/mmap.h
+++ b/include/trace/events/mmap.h@@ -42,6 +42,77 @@ TRACE_EVENT(vm_unmapped_area, __entry->low_limit, __entry->high_limit, __entry->align_mask, __entry->align_offset) ); + +TRACE_EVENT(vma_mt_szero, + TP_PROTO(struct mm_struct *mm, unsigned long start, + unsigned long end), + + TP_ARGS(mm, start, end), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(unsigned long, start) + __field(unsigned long, end) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->start = start; + __entry->end = end - 1; + ), + + TP_printk("mt_mod %p, (NULL), SNULL, %lu, %lu,", + __entry->mm, + (unsigned long) __entry->start, + (unsigned long) __entry->end + ) +); + +TRACE_EVENT(vma_store, + TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma), + + TP_ARGS(mm, vma), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(struct vm_area_struct *, vma) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->vma = vma; + __entry->vm_start = vma->vm_start; + __entry->vm_end = vma->vm_end - 1; + ), + + TP_printk("mt_mod %p, (%p), STORE, %lu, %lu,", + __entry->mm, __entry->vma, + (unsigned long) __entry->vm_start, + (unsigned long) __entry->vm_end + ) +); + + +TRACE_EVENT(exit_mmap, + TP_PROTO(struct mm_struct *mm), + + TP_ARGS(mm), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + ), + + TP_fast_assign( + __entry->mm = mm; + ), + + TP_printk("mt_mod %p, DESTROY\n", + __entry->mm + ) +); + #endif /* This part must be outside protection */
diff --git a/init/main.c b/init/main.c
index bb984ed79de0..ed906a15a86c 100644
--- a/init/main.c
+++ b/init/main.c@@ -115,6 +115,7 @@ static int kernel_init(void *); extern void init_IRQ(void); extern void radix_tree_init(void); +extern void maple_tree_init(void); /* * Debug helper: via this flag we know that we are in 'early bootup code'
@@ -997,6 +998,7 @@ asmlinkage __visible void __init __no_sanitize_address start_kernel(void) "Interrupts were enabled *very* early, fixing it\n")) local_irq_disable(); radix_tree_init(); + maple_tree_init(); /* * Set up housekeeping before setting up workqueues to allow the unbound
diff --git a/kernel/fork.c b/kernel/fork.c
index 3244cc56b697..cc9bb95c7678 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c@@ -604,6 +604,9 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, rb_link = &tmp->vm_rb.rb_right; rb_parent = &tmp->vm_rb; + /* Link the vma into the MT */ + vma_store(mm, tmp); + mm->map_count++; if (!(tmp->vm_flags & VM_WIPEONFORK)) retval = copy_page_range(tmp, mpnt);
@@ -1037,6 +1040,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, { mm->mmap = NULL; mm->mm_rb = RB_ROOT; + mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); + mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1);
diff --git a/mm/init-mm.c b/mm/init-mm.c
index b4a6f38fb51d..7622ca24eeb7 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c@@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/mm_types.h> #include <linux/rbtree.h> +#include <linux/maple_tree.h> #include <linux/rwsem.h> #include <linux/spinlock.h> #include <linux/list.h>
@@ -28,6 +29,7 @@ */ struct mm_struct init_mm = { .mm_rb = RB_ROOT, + .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1),
diff --git a/mm/internal.h b/mm/internal.h
index 3b79a5c9427a..7ec79cef3ea9 100644
--- a/mm/internal.h
+++ b/mm/internal.h@@ -382,6 +382,79 @@ static inline bool is_data_mapping(vm_flags_t flags) return (flags & (VM_WRITE | VM_SHARED | VM_STACK)) == VM_WRITE; } +/* Maple tree operations using VMAs */ +/* + * vma_mas_store() - Store a VMA in the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state + * + * Efficient way to store a VMA in the maple tree when the @mas has already + * walked to the correct location. + * + * Note: the end address is inclusive in the maple tree. + */ +static inline int vma_mas_store(struct vm_area_struct *vma, struct ma_state *mas) +{ + int ret; + +#ifdef CONFIG_DEBUG_MAPLE_TREE + /* Make sure no VMAs are about to be lost. */ + MA_STATE(test, mas->tree, vma->vm_start, vma->vm_end - 1); + struct vm_area_struct *vma_mas; + int count = 0; + + mas_for_each(&test, vma_mas, vma->vm_end - 1) { + /* Rule out vma_expand */ + if ((vma->vm_start != vma_mas->vm_start) && + (vma->vm_end != vma_mas->vm_end)) + count++; + } + + BUG_ON(count); + + BUG_ON(mas->min > vma->vm_start); + BUG_ON(mas->index > vma->vm_start); +#endif + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + ret = mas_store_gfp(mas, vma, GFP_KERNEL); + return ret; +} + +/* + * vma_mas_remove() - Remove a VMA from the maple tree. + * @vma: The vm_area_struct + * @mas: The maple state + * + * Efficient way to remove a VMA from the maple tree when the @mas has already + * been established and points to the correct location. + * Note: the end address is inclusive in the maple tree. + */ +static inline int vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) +{ + int ret; + +#ifdef CONFIG_DEBUG_MAPLE_TREE + /* Make sure no VMAs are about to be lost. */ + MA_STATE(test, mas->tree, vma->vm_start, vma->vm_end - 1); + struct vm_area_struct *vma_mas; + int count = 0; + + mas_for_each(&test, vma_mas, vma->vm_end - 1) + count++; + + BUG_ON(count != 1); + + BUG_ON(mas->min > vma->vm_start); + BUG_ON(mas->index > vma->vm_start); + BUG_ON(mas->min > mas->index); +#endif + mas->index = vma->vm_start; + mas->last = vma->vm_end - 1; + ret = mas_store_gfp(mas, NULL, GFP_KERNEL); + return ret; +} + /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev);
diff --git a/mm/mmap.c b/mm/mmap.c
index bfb0ea164a90..c2f1431886d4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c@@ -375,7 +375,70 @@ static int browse_rb(struct mm_struct *mm) } return bug ? -1 : i; } +#if defined(CONFIG_DEBUG_MAPLE_TREE) +extern void mt_validate(struct maple_tree *mt); +extern void mt_dump(const struct maple_tree *mt); +/* Validate the maple tree */ +static void validate_mm_mt(struct mm_struct *mm) +{ + struct maple_tree *mt = &mm->mm_mt; + struct vm_area_struct *vma_mt, *vma = mm->mmap; + + MA_STATE(mas, mt, 0, 0); + mas_for_each(&mas, vma_mt, ULONG_MAX) { + if (xa_is_zero(vma_mt)) + continue; + + if (!vma) + break; + + if ((vma != vma_mt) || + (vma->vm_start != vma_mt->vm_start) || + (vma->vm_end != vma_mt->vm_end) || + (vma->vm_start != mas.index) || + (vma->vm_end - 1 != mas.last)) { + pr_emerg("issue in %s\n", current->comm); + dump_stack(); +#ifdef CONFIG_DEBUG_VM + dump_vma(vma_mt); + pr_emerg("and next in rb\n"); + dump_vma(vma->vm_next); +#endif + pr_emerg("mt piv: %px %lu - %lu\n", vma_mt, + mas.index, mas.last); + pr_emerg("mt vma: %px %lu - %lu\n", vma_mt, + vma_mt->vm_start, vma_mt->vm_end); + pr_emerg("rb vma: %px %lu - %lu\n", vma, + vma->vm_start, vma->vm_end); + pr_emerg("rb->next = %px %lu - %lu\n", vma->vm_next, + vma->vm_next->vm_start, vma->vm_next->vm_end); + + mt_dump(mas.tree); + if (vma_mt->vm_end != mas.last + 1) { + pr_err("vma: %px vma_mt %lu-%lu\tmt %lu-%lu\n", + mm, vma_mt->vm_start, vma_mt->vm_end, + mas.index, mas.last); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_end != mas.last + 1, mm); + if (vma_mt->vm_start != mas.index) { + pr_err("vma: %px vma_mt %px %lu - %lu doesn't match\n", + mm, vma_mt, vma_mt->vm_start, vma_mt->vm_end); + mt_dump(mas.tree); + } + VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); + } + VM_BUG_ON(vma != vma_mt); + vma = vma->vm_next; + + } + VM_BUG_ON(vma); + mt_validate(&mm->mm_mt); +} +#else +#define validate_mm_mt(root) do { } while (0) +#endif static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) { struct rb_node *nd;
@@ -430,6 +493,7 @@ static void validate_mm(struct mm_struct *mm) } #else #define validate_mm_rb(root, ignore) do { } while (0) +#define validate_mm_mt(root) do { } while (0) #define validate_mm(mm) do { } while (0) #endif
@@ -674,11 +738,42 @@ static void __vma_link_file(struct vm_area_struct *vma) } } +/* + * vma_mt_szero() - Set a given range to zero. Used when modifying a + * vm_area_struct start or end. + * + * @mm: The struct_mm + * @start: The start address to zero + * @end: The end address to zero. + */ +static inline void vma_mt_szero(struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + MA_STATE(mas, &mm->mm_mt, start, end - 1); + + trace_vma_mt_szero(mm, start, end); + mas_store_gfp(&mas, NULL, GFP_KERNEL); +} + +/* + * vma_store() - Store a given vm_area_struct in the maple tree. + * @mm: The struct_mm + * @vma: The vm_area_struct to store in the maple tree. + */ +void vma_store(struct mm_struct *mm, struct vm_area_struct *vma) +{ + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end - 1); + + trace_vma_store(mm, vma); + mas_store_gfp(&mas, vma, GFP_KERNEL); +} + static void __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node **rb_link, struct rb_node *rb_parent) { + vma_store(mm, vma); __vma_link_list(mm, vma, prev); __vma_link_rb(mm, vma, rb_link, rb_parent); }
@@ -751,6 +846,9 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, long adjust_next = 0; int remove_next = 0; + validate_mm(mm); + validate_mm_mt(mm); + if (next && !insert) { struct vm_area_struct *exporter = NULL, *importer = NULL;
@@ -876,17 +974,28 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (start != vma->vm_start) { + unsigned long old_start = vma->vm_start; vma->vm_start = start; + if (old_start < start) + vma_mt_szero(mm, old_start, start); start_changed = true; } if (end != vma->vm_end) { + unsigned long old_end = vma->vm_end; vma->vm_end = end; + if (old_end > end) + vma_mt_szero(mm, end, old_end); end_changed = true; } + + if (end_changed || start_changed) + vma_store(mm, vma); + vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next; next->vm_pgoff += adjust_next >> PAGE_SHIFT; + vma_store(mm, next); } if (file) {
@@ -900,6 +1009,8 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, /* * vma_merge has merged next into vma, and needs * us to remove next before dropping the locks. + * Since we have expanded over this vma, the maple tree will + * have overwritten by storing the value */ if (remove_next != 3) __vma_unlink(mm, next, next);
@@ -1019,6 +1130,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, uprobe_mmap(insert); validate_mm(mm); + validate_mm_mt(mm); return 0; }
@@ -1166,6 +1278,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *area, *next; int err; + validate_mm_mt(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too.
@@ -1241,6 +1354,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, khugepaged_enter_vma_merge(area, vm_flags); return area; } + validate_mm_mt(mm); return NULL; }
@@ -1722,6 +1836,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; + validate_mm_mt(mm); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages;
@@ -1869,6 +1984,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vma_set_page_prot(vma); + validate_mm_mt(mm); return addr; unmap_and_free_vma:
@@ -1885,6 +2001,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, unacct_error: if (charged) vm_unacct_memory(charged); + validate_mm_mt(mm); return error; }
@@ -1901,12 +2018,21 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; unsigned long length, low_limit, high_limit, gap_start, gap_end; + unsigned long gap; + MA_STATE(mas, &mm->mm_mt, 0, 0); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; + rcu_read_lock(); + mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + length); + rcu_read_unlock(); + gap = mas.index; + gap += (info->align_offset - gap) & info->align_mask; + /* Adjust search limits by the desired length */ if (info->high_limit < length) return -ENOMEM;
@@ -1988,20 +2114,39 @@ static unsigned long unmapped_area(struct vm_unmapped_area_info *info) VM_BUG_ON(gap_start + info->length > info->high_limit); VM_BUG_ON(gap_start + info->length > gap_end); + + VM_BUG_ON(gap != gap_start); return gap_start; } +static inline unsigned long top_area_aligned(struct vm_unmapped_area_info *info, + unsigned long end) +{ + return (end - info->length - info->align_offset) & (~info->align_mask); +} + static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma = NULL; unsigned long length, low_limit, high_limit, gap_start, gap_end; + unsigned long gap; + + MA_STATE(mas, &mm->mm_mt, 0, 0); + validate_mm_mt(mm); /* Adjust search length to account for worst case alignment overhead */ length = info->length + info->align_mask; if (length < info->length) return -ENOMEM; + rcu_read_lock(); + mas_empty_area_rev(&mas, info->low_limit, info->high_limit - 1, + length); + rcu_read_unlock(); + gap = (mas.index + info->align_mask) & ~info->align_mask; + gap -= info->align_offset & info->align_mask; + /* * Adjust search limits by the desired length. * See implementation comment at top of unmapped_area().
@@ -2087,6 +2232,32 @@ static unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) VM_BUG_ON(gap_end < info->low_limit); VM_BUG_ON(gap_end < gap_start); + + if (gap != gap_end) { + pr_err("%s: %px Gap was found: mt %lu gap_end %lu\n", __func__, + mm, gap, gap_end); + pr_err("window was %lu - %lu size %lu\n", info->high_limit, + info->low_limit, length); + pr_err("mas.min %lu max %lu mas.last %lu\n", mas.min, mas.max, + mas.last); + pr_err("mas.index %lu align mask %lu offset %lu\n", mas.index, + info->align_mask, info->align_offset); + pr_err("rb_find_vma find on %lu => %px (%px)\n", mas.index, + find_vma(mm, mas.index), vma); +#if defined(CONFIG_DEBUG_MAPLE_TREE) + mt_dump(&mm->mm_mt); +#endif + { + struct vm_area_struct *dv = mm->mmap; + + while (dv) { + printk("vma %px %lu-%lu\n", dv, dv->vm_start, dv->vm_end); + dv = dv->vm_next; + } + } + VM_BUG_ON(gap != gap_end); + } + return gap_end; }
@@ -2300,7 +2471,6 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) vmacache_update(addr, vma); return vma; } - EXPORT_SYMBOL(find_vma); /*
@@ -2381,6 +2551,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) unsigned long gap_addr; int error = 0; + validate_mm_mt(mm); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT;
@@ -2457,6 +2628,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma_merge(vma, vma->vm_flags); validate_mm(mm); + validate_mm_mt(mm); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */
@@ -2471,6 +2643,7 @@ int expand_downwards(struct vm_area_struct *vma, struct vm_area_struct *prev; int error = 0; + validate_mm(mm); address &= PAGE_MASK; if (address < mmap_min_addr) return -EPERM;
@@ -2524,6 +2697,8 @@ int expand_downwards(struct vm_area_struct *vma, anon_vma_interval_tree_pre_update_vma(vma); vma->vm_start = address; vma->vm_pgoff -= grow; + /* Overwrite old entry in mtree. */ + vma_store(mm, vma); anon_vma_interval_tree_post_update_vma(vma); vma_gap_update(vma); spin_unlock(&mm->page_table_lock);
@@ -2665,6 +2840,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; + vma_mt_szero(mm, vma->vm_start, end); do { vma_rb_erase(vma, &mm->mm_rb); mm->map_count--;
@@ -2703,6 +2879,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct *new; int err; + validate_mm_mt(mm); if (vma->vm_ops && vma->vm_ops->may_split) { err = vma->vm_ops->may_split(vma, addr);
@@ -2755,6 +2932,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, mpol_put(vma_policy(new)); out_free_vma: vm_area_free(new); + validate_mm_mt(mm); return err; }
@@ -3026,6 +3204,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla pgoff_t pgoff = addr >> PAGE_SHIFT; int error; unsigned long mapped_addr; + validate_mm_mt(mm); /* Until we need other flags, refuse anything except VM_EXEC. */ if ((flags & (~VM_EXEC)) != 0)
@@ -3083,6 +3262,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; + validate_mm_mt(mm); return 0; }
@@ -3153,6 +3333,13 @@ void exit_mmap(struct mm_struct *mm) mmap_write_unlock(mm); } + /* + * Actually taking the mmap semaphore here costs 3% performance on + * a large machine: + * https://lore.kernel.org/lkml/20170725151754.3txp44a2kbffsxdg@node.shutemov.name/ + * Lockdep will complain about not holding the mmap_lock, so we lie. + */ + rwsem_acquire(&mm->mmap_lock.dep_map, 0, 0, _THIS_IP_); if (mm->locked_vm) unlock_range(mm->mmap, ULONG_MAX);
@@ -3181,6 +3368,11 @@ void exit_mmap(struct mm_struct *mm) vma = remove_vma(vma); cond_resched(); } + + trace_exit_mmap(mm); + __mt_destroy(&mm->mm_mt); + rwsem_release(&mm->mmap_lock.dep_map, _THIS_IP_); + vm_unacct_memory(nr_accounted); }
@@ -3192,10 +3384,25 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { struct vm_area_struct *prev; struct rb_node **rb_link, *rb_parent; + unsigned long start = vma->vm_start; + struct vm_area_struct *overlap = NULL; if (find_vma_links(mm, vma->vm_start, vma->vm_end, &prev, &rb_link, &rb_parent)) return -ENOMEM; + + overlap = mt_find(&mm->mm_mt, &start, vma->vm_end - 1); + if (overlap) { + + pr_err("Found vma ending at %lu\n", start - 1); + pr_err("vma : %lu => %lu-%lu\n", (unsigned long)overlap, + overlap->vm_start, overlap->vm_end - 1); +#if defined(CONFIG_DEBUG_MAPLE_TREE) + mt_dump(&mm->mm_mt); +#endif + BUG(); + } + if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, vma_pages(vma))) return -ENOMEM;
@@ -3235,7 +3442,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; bool faulted_in_anon_vma = true; + unsigned long index = addr; + validate_mm_mt(mm); /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging.
@@ -3247,6 +3456,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) return NULL; /* should never get here */ + if (mt_find(&mm->mm_mt, &index, addr+len - 1)) + BUG(); new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx);
@@ -3290,6 +3501,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, vma_link(mm, new_vma, prev, rb_link, rb_parent); *need_rmap_locks = false; } + validate_mm_mt(mm); return new_vma; out_free_mempol:
@@ -3297,6 +3509,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, out_free_vma: vm_area_free(new_vma); out: + validate_mm_mt(mm); return NULL; }
@@ -3433,6 +3646,7 @@ static struct vm_area_struct *__install_special_mapping( int ret; struct vm_area_struct *vma; + validate_mm_mt(mm); vma = vm_area_alloc(mm); if (unlikely(vma == NULL)) return ERR_PTR(-ENOMEM);
@@ -3454,10 +3668,12 @@ static struct vm_area_struct *__install_special_mapping( perf_event_mmap(vma); + validate_mm_mt(mm); return vma; out: vm_area_free(vma); + validate_mm_mt(mm); return ERR_PTR(ret); }
--
2.30.2