[PATCH v3 7/7] ARM: implement support for vmap'ed stacks
From: Ard Biesheuvel <ardb@kernel.org>
Date: 2021-11-15 11:21:53
Subsystem:
arm port, the rest · Maintainers:
Russell King, Linus Torvalds
Wire up the generic support for managing task stack allocations via vmalloc, and implement the entry code that detects whether we faulted because of a stack overrun (or future stack overrun caused by pushing the pt_regs array) While this adds a fair amount of tricky entry asm code, it should be noted that it only adds a TST + branch to the svc_entry path. The code implementing the non-trivial handling of the overflow stack is emitted out-of-line into the .text section. Since on ARM, we rely on do_translation_fault() to keep PMD level page table entries that cover the vmalloc region up to date, we need to ensure that we don't hit such a stale PMD entry when accessing the stack. So we do a dummy read from the new stack while still running from the old one on the context switch path, and bump the vmalloc_seq counter when PMD level entries in the vmalloc range are modified, so that the MM switch fetches the latest version of the entries. Signed-off-by: Ard Biesheuvel <ardb@kernel.org> Tested-by: Keith Packard <redacted> --- arch/arm/Kconfig | 1 + arch/arm/include/asm/page.h | 4 + arch/arm/include/asm/thread_info.h | 8 ++ arch/arm/kernel/entry-armv.S | 79 ++++++++++++++++++-- arch/arm/kernel/entry-header.S | 57 ++++++++++++++ arch/arm/kernel/irq.c | 9 ++- arch/arm/kernel/traps.c | 65 +++++++++++++++- arch/arm/kernel/unwind.c | 3 +- arch/arm/kernel/vmlinux.lds.S | 4 +- 9 files changed, 219 insertions(+), 11 deletions(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index b1eba1b4168c..a072600527ca 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig@@ -127,6 +127,7 @@ config ARM select RTC_LIB select SYS_SUPPORTS_APM_EMULATION select THREAD_INFO_IN_TASK if CURRENT_POINTER_IN_TPIDRURO + select HAVE_ARCH_VMAP_STACK if THREAD_INFO_IN_TASK select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M # Above selects are sorted alphabetically; please add new ones # according to that. Thanks.
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 11b058a72a5b..7b871ed99ccf 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h@@ -149,6 +149,10 @@ extern void copy_page(void *to, const void *from); #include <asm/pgtable-2level-types.h> #endif +#ifdef CONFIG_VMAP_STACK +#define ARCH_PAGE_TABLE_SYNC_MASK PGTBL_PMD_MODIFIED +#endif + #endif /* CONFIG_MMU */ typedef struct page *pgtable_t;
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 164e15f26485..004b89d86224 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h@@ -25,6 +25,14 @@ #define THREAD_SIZE (PAGE_SIZE << THREAD_SIZE_ORDER) #define THREAD_START_SP (THREAD_SIZE - 8) +#ifdef CONFIG_VMAP_STACK +#define THREAD_ALIGN (2 * THREAD_SIZE) +#else +#define THREAD_ALIGN THREAD_SIZE +#endif + +#define OVERFLOW_STACK_SIZE SZ_4K + #ifndef __ASSEMBLY__ struct task_struct;
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index b18f3aa98f42..ad8d8304539e 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S@@ -57,6 +57,14 @@ UNWIND( .setfp fpreg, sp ) @ subs r2, sp, r0 @ SP above bottom of IRQ stack? rsbscs r2, r2, #THREAD_SIZE @ ... and below the top? +#ifdef CONFIG_VMAP_STACK + bcs .L\@ + mov_l r2, overflow_stack @ Take base address + add r2, r2, r3 @ Top of this CPU's overflow stack + subs r2, r0, r2 @ Compare with incoming SP + rsbscs r2, r2, #OVERFLOW_STACK_SIZE +.L\@: +#endif movcs sp, r0 @ If so, revert to incoming SP #ifndef CONFIG_UNWINDER_ARM
@@ -188,13 +196,18 @@ ENDPROC(__und_invalid) #define SPFIX(code...) #endif - .macro svc_entry, stack_hole=0, trace=1, uaccess=1 + .macro svc_entry, stack_hole=0, trace=1, uaccess=1, overflow_check=1 UNWIND(.fnstart ) - UNWIND(.save {r0 - pc} ) sub sp, sp, #(SVC_REGS_SIZE + \stack_hole) + THUMB( add sp, r0 ) @ get SP in a GPR without + THUMB( sub r0, sp, r0 ) @ using a temp register + + .if \overflow_check + UNWIND(.save {r0 - pc} ) + do_overflow_check (SVC_REGS_SIZE + \stack_hole) + .endif + #ifdef CONFIG_THUMB2_KERNEL - add sp, r0 @ get SP in a GPR without - sub r0, sp, r0 @ using a temp register tst r0, #4 @ test stack pointer alignment sub r0, sp, r0 @ restore original R0 sub sp, r0 @ restore original SP
@@ -827,12 +840,20 @@ ENTRY(__switch_to) str r7, [r8] #endif mov r0, r5 -#if !defined(CONFIG_THUMB2_KERNEL) +#if !defined(CONFIG_THUMB2_KERNEL) && !defined(CONFIG_VMAP_STACK) set_current r7 ldmia r4, {r4 - sl, fp, sp, pc} @ Load all regs saved previously #else mov r1, r7 ldmia r4, {r4 - sl, fp, ip, lr} @ Load all regs saved previously +#ifdef CONFIG_VMAP_STACK + @ + @ Do a dummy read from the new stack while running from the old one so + @ that we can rely on do_translation_fault() to fix up any stale PMD + @ entries covering the vmalloc region. + @ + ldr r2, [ip] +#endif @ When CONFIG_THREAD_INFO_IN_TASK=n, the update of SP itself is what @ effectuates the task switch, as that is what causes the observable
@@ -849,6 +870,54 @@ ENTRY(__switch_to) UNWIND(.fnend ) ENDPROC(__switch_to) +#ifdef CONFIG_VMAP_STACK + .text +__bad_stack: + @ + @ We detected an overflow in svc_entry, which switched to the + @ overflow stack. Stash the exception regs, and head to our overflow + @ handler. Entered with the orginal value of SP in IP, and the original + @ value of IP in TPIDRURW + @ + +#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC) + mov ip, ip @ mov expected by unwinder + push {fp, ip, lr, pc} @ GCC flavor frame record +#else + str ip, [sp, #-8]! @ store original SP + push {fpreg, lr} @ Clang flavor frame record +#endif +UNWIND( ldr ip, [r0, #4] ) @ load exception LR +UNWIND( str ip, [sp, #12] ) @ store in the frame record + mrc p15, 0, ip, c13, c0, 2 @ reload IP + + @ Store the original GPRs to the new stack. + svc_entry uaccess=0, overflow_check=0 + +UNWIND( .save {sp, pc} ) +UNWIND( .save {fpreg, lr} ) +UNWIND( .setfp fpreg, sp ) + + ldr fpreg, [sp, #S_SP] @ Add our frame record + @ to the linked list +#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC) + ldr r1, [fp, #4] @ reload SP at entry + add fp, fp, #12 +#else + ldr r1, [fpreg, #8] +#endif + str r1, [sp, #S_SP] @ store in pt_regs + + @ Stash the regs for handle_bad_stack + mov r0, sp + + @ Time to die + bl handle_bad_stack + nop +UNWIND( .fnend ) +ENDPROC(__bad_stack) +#endif + __INIT /*
diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
index ae24dd54e9ef..823dd1aa6e3e 100644
--- a/arch/arm/kernel/entry-header.S
+++ b/arch/arm/kernel/entry-header.S@@ -423,3 +423,60 @@ scno .req r7 @ syscall number tbl .req r8 @ syscall table pointer why .req r8 @ Linux syscall (!= 0) tsk .req r9 @ current thread_info + + .macro do_overflow_check, frame_size:req +#ifdef CONFIG_VMAP_STACK + @ + @ Test whether the SP has overflowed. Task and IRQ stacks are aligned + @ so that SP & BIT(THREAD_SIZE_ORDER + PAGE_SHIFT) should always be + @ zero. + @ +ARM( tst sp, #1 << (THREAD_SIZE_ORDER + PAGE_SHIFT) ) +THUMB( tst r0, #1 << (THREAD_SIZE_ORDER + PAGE_SHIFT) ) +THUMB( it ne ) + bne .Lstack_overflow_check\@ + + .pushsection .text +.Lstack_overflow_check\@: + @ + @ Either we've just detected an overflow, or we've taken an exception + @ while on the overflow stack. We cannot use the stack until we have + @ decided which is the case. However, as we won't return to userspace, + @ we can clobber some USR/SYS mode registers to free up GPRs. + @ + + mcr p15, 0, ip, c13, c0, 2 @ Stash IP in TPIDRURW + mrs ip, cpsr + eor ip, ip, #(SVC_MODE ^ SYSTEM_MODE) + msr cpsr_c, ip @ Switch to SYS mode + eor ip, ip, #(SVC_MODE ^ SYSTEM_MODE) + mov sp, ip @ Stash mode in SP_usr + + @ Load the overflow stack into IP using LR_usr as a scratch register + mov_l lr, overflow_stack + OVERFLOW_STACK_SIZE + mrc p15, 0, ip, c13, c0, 4 @ Get CPU offset + add ip, ip, lr @ IP := this CPU's overflow stack + mov lr, sp @ Unstash mode into LR_usr + msr cpsr_c, lr @ Switch back to SVC mode + + @ + @ Check whether we are already on the overflow stack. This may happen, + @ e.g., when performing accesses that may fault when dumping the stack. + @ The overflow stack is not in the vmalloc space so we only need to + @ check whether the incoming SP is below the top of the overflow stack. + @ +ARM( subs ip, sp, ip ) @ Delta with top of overflow stack +THUMB( subs ip, r0, ip ) + mrclo p15, 0, ip, c13, c0, 2 @ Restore IP + blo .Lout\@ @ Carry on + +THUMB( sub r0, sp, r0 ) @ Restore original R0 +THUMB( sub sp, r0 ) @ Restore original SP + sub sp, sp, ip @ Switch to overflow stack + add ip, sp, ip @ Keep incoming SP value in IP + add ip, ip, #\frame_size @ Undo svc_entry's SP change + b __bad_stack + .popsection +.Lout\@: +#endif + .endm
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index e05219bca218..5deb40f39999 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c@@ -56,7 +56,14 @@ static void __init init_irq_stacks(void) int cpu; for_each_possible_cpu(cpu) { - stack = (u8 *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER); + if (!IS_ENABLED(CONFIG_VMAP_STACK)) + stack = (u8 *)__get_free_pages(GFP_KERNEL, + THREAD_SIZE_ORDER); + else + stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN, + THREADINFO_GFP, NUMA_NO_NODE, + __builtin_return_address(0)); + if (WARN_ON(!stack)) break; per_cpu(irq_stack_ptr, cpu) = &stack[THREAD_SIZE];
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index b42c446cec9a..eb8c73be7c81 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c@@ -121,7 +121,8 @@ void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl) static int verify_stack(unsigned long sp) { if (sp < PAGE_OFFSET || - (sp > (unsigned long)high_memory && high_memory != NULL)) + (!IS_ENABLED(CONFIG_VMAP_STACK) && + sp > (unsigned long)high_memory && high_memory != NULL)) return -EFAULT; return 0;
@@ -291,7 +292,8 @@ static int __die(const char *str, int err, struct pt_regs *regs) if (!user_mode(regs) || in_interrupt()) { dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp, - ALIGN(regs->ARM_sp, THREAD_SIZE)); + ALIGN(regs->ARM_sp - THREAD_SIZE, THREAD_ALIGN) + + THREAD_SIZE); dump_backtrace(regs, tsk, KERN_EMERG); dump_instr(KERN_EMERG, regs); }
@@ -838,3 +840,62 @@ void __init early_trap_init(void *vectors_base) */ #endif } + +#ifdef CONFIG_VMAP_STACK + +DECLARE_PER_CPU(u8 *, irq_stack_ptr); + +asmlinkage DEFINE_PER_CPU_ALIGNED(u8[OVERFLOW_STACK_SIZE], overflow_stack); + +asmlinkage void handle_bad_stack(struct pt_regs *regs) +{ + unsigned long tsk_stk = (unsigned long)current->stack; + unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); + unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); + + console_verbose(); + pr_emerg("Insufficient stack space to handle exception!"); + + pr_emerg("Task stack: [0x%08lx..0x%08lx]\n", + tsk_stk, tsk_stk + THREAD_SIZE); + pr_emerg("IRQ stack: [0x%08lx..0x%08lx]\n", + irq_stk, irq_stk + THREAD_SIZE); + pr_emerg("Overflow stack: [0x%08lx..0x%08lx]\n", + ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); + + die("kernel stack overflow", regs, 0); +} + +/* + * Normally, we rely on the logic in do_translation_fault() to update stale PMD + * entries covering the vmalloc space in a task's page tables when it first + * accesses the region in question. Unfortunately, this is not sufficient when + * the task stack resides in the vmalloc region, as do_translation_fault() is a + * C function that needs a stack to run. + * + * So we need to ensure that these PMD entries are up to date *before* the MM + * switch. As we already have some logic in the MM switch path that takes care + * of this, let's trigger it by bumping the counter every time the core vmalloc + * code modifies a PMD entry in the vmalloc region. + */ +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + if (start > VMALLOC_END || end < VMALLOC_START) + return; + + /* + * This hooks into the core vmalloc code to receive notifications of + * any PMD level changes that have been made to the kernel page tables. + * This means it should only be triggered once for every MiB worth of + * vmalloc space, given that we don't support huge vmalloc/vmap on ARM, + * and that kernel PMD level table entries are rarely (if ever) + * updated. + * + * This means that the counter is going to max out at ~250 for the + * typical case. If it overflows, something entirely unexpected has + * occurred so let's throw a warning if that happens. + */ + WARN_ON(++init_mm.context.vmalloc_seq == UINT_MAX); +} + +#endif
diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
index e8d729975f12..c5ea328c428d 100644
--- a/arch/arm/kernel/unwind.c
+++ b/arch/arm/kernel/unwind.c@@ -389,7 +389,8 @@ int unwind_frame(struct stackframe *frame) /* store the highest address on the stack to avoid crossing it*/ ctrl.sp_low = frame->sp; - ctrl.sp_high = ALIGN(ctrl.sp_low, THREAD_SIZE); + ctrl.sp_high = ALIGN(ctrl.sp_low - THREAD_SIZE, THREAD_ALIGN) + + THREAD_SIZE; pr_debug("%s(pc = %08lx lr = %08lx sp = %08lx)\n", __func__, frame->pc, frame->lr, frame->sp);
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index f02d617e3359..aa12b65a7fd6 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S@@ -138,12 +138,12 @@ SECTIONS #ifdef CONFIG_STRICT_KERNEL_RWX . = ALIGN(1<<SECTION_SHIFT); #else - . = ALIGN(THREAD_SIZE); + . = ALIGN(THREAD_ALIGN); #endif __init_end = .; _sdata = .; - RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) _edata = .; BSS_SECTION(0, 0, 0)
--
2.30.2
_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel