Thread (20 messages) 20 messages, 5 authors, 2021-10-24

Re: [PATCH v2 7/7] ARM: implement support for vmap'ed stacks

From: Nick Desaulniers <hidden>
Date: 2021-10-18 21:01:31

On Mon, Oct 18, 2021 at 7:16 AM Ard Biesheuvel [off-list ref] wrote:
quoted hunk ↗ jump to hunk
Wire up the generic support for managing task stack allocations via vmalloc,
and implement the entry code that detects whether we faulted because of a
stack overrun (or future stack overrun caused by pushing the pt_regs array)

While this adds a fair amount of tricky entry asm code, it should be
noted that it only adds a TST + branch to the svc_entry path. The code
implementing the non-trivial handling of the overflow stack is emitted
out-of-line into the .text section.

Since on ARM, we rely on do_translation_fault() to keep PMD level page
table entries that cover the vmalloc region up to date, we need to
ensure that we don't hit such a stale PMD entry when accessing the
stack. So we do a dummy read from the new stack while still running from
the old one on the context switch path, and bump the vmalloc_seq counter
when PMD level entries in the vmalloc range are modified, so that the MM
switch fetches the latest version of the entries.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Tested-by: Keith Packard <redacted>
---
 arch/arm/Kconfig                   |  1 +
 arch/arm/include/asm/page.h        |  4 +
 arch/arm/include/asm/thread_info.h |  8 ++
 arch/arm/kernel/entry-armv.S       | 79 ++++++++++++++++++--
 arch/arm/kernel/entry-header.S     | 57 ++++++++++++++
 arch/arm/kernel/irq.c              |  9 ++-
 arch/arm/kernel/traps.c            | 65 +++++++++++++++-
 arch/arm/kernel/unwind.c           |  3 +-
 arch/arm/kernel/vmlinux.lds.S      |  4 +-
 9 files changed, 219 insertions(+), 11 deletions(-)
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index d46b243e1b26..964cc41a0fed 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -126,6 +126,7 @@ config ARM
        select RTC_LIB
        select SYS_SUPPORTS_APM_EMULATION
        select THREAD_INFO_IN_TASK if CURRENT_POINTER_IN_TPIDRURO
+       select HAVE_ARCH_VMAP_STACK if THREAD_INFO_IN_TASK
        select TRACE_IRQFLAGS_SUPPORT if !CPU_V7M
        # Above selects are sorted alphabetically; please add new ones
        # according to that.  Thanks.
diff --git a/arch/arm/include/asm/page.h b/arch/arm/include/asm/page.h
index 11b058a72a5b..7b871ed99ccf 100644
--- a/arch/arm/include/asm/page.h
+++ b/arch/arm/include/asm/page.h
@@ -149,6 +149,10 @@ extern void copy_page(void *to, const void *from);
 #include <asm/pgtable-2level-types.h>
 #endif

+#ifdef CONFIG_VMAP_STACK
+#define ARCH_PAGE_TABLE_SYNC_MASK      PGTBL_PMD_MODIFIED
+#endif
+
 #endif /* CONFIG_MMU */

 typedef struct page *pgtable_t;
diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h
index 787511396f3f..900c8ae484c9 100644
--- a/arch/arm/include/asm/thread_info.h
+++ b/arch/arm/include/asm/thread_info.h
@@ -25,6 +25,14 @@
 #define THREAD_SIZE            (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define THREAD_START_SP                (THREAD_SIZE - 8)

+#ifdef CONFIG_VMAP_STACK
+#define THREAD_ALIGN           (2 * THREAD_SIZE)
+#else
+#define THREAD_ALIGN           THREAD_SIZE
+#endif
+
+#define OVERFLOW_STACK_SIZE    SZ_4K
+
 #ifndef __ASSEMBLY__

 struct task_struct;
diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S
index 80afab1c344b..6910d6596e08 100644
--- a/arch/arm/kernel/entry-armv.S
+++ b/arch/arm/kernel/entry-armv.S
@@ -58,6 +58,14 @@ UNWIND(      .setfp  fpreg, sp               )
        @
        subs    r2, sp, r0              @ SP above bottom of IRQ stack?
        rsbscs  r2, r2, #THREAD_SIZE    @ ... and below the top?
+#ifdef CONFIG_VMAP_STACK
+       bcs     .L\@
+       mov_l   r2, overflow_stack      @ Take base address
+       add     r2, r2, r3              @ Top of this CPU's overflow stack
+       subs    r2, r0, r2              @ Compare with incoming SP
+       rsbscs  r2, r2, #OVERFLOW_STACK_SIZE
+.L\@:
+#endif
        movcs   sp, r0                  @ If so, revert to incoming SP

 #ifndef CONFIG_UNWINDER_ARM
@@ -189,13 +197,18 @@ ENDPROC(__und_invalid)
 #define SPFIX(code...)
 #endif

-       .macro  svc_entry, stack_hole=0, trace=1, uaccess=1
+       .macro  svc_entry, stack_hole=0, trace=1, uaccess=1, overflow_check=1
  UNWIND(.fnstart               )
- UNWIND(.save {r0 - pc}                )
        sub     sp, sp, #(SVC_REGS_SIZE + \stack_hole)
+ THUMB(        add     sp, r0          )       @ get SP in a GPR without
+ THUMB(        sub     r0, sp, r0      )       @ using a temp register
+
+       .if     \overflow_check
+ UNWIND(.save  {r0 - pc}       )
+       do_overflow_check (SVC_REGS_SIZE + \stack_hole)
+       .endif
+
 #ifdef CONFIG_THUMB2_KERNEL
-       add     sp, r0                  @ get SP in a GPR without
-       sub     r0, sp, r0              @ using a temp register
        tst     r0, #4                  @ test stack pointer alignment
        sub     r0, sp, r0              @ restore original R0
        sub     sp, r0                  @ restore original SP
@@ -828,12 +841,20 @@ ENTRY(__switch_to)
        str     r7, [r8]
 #endif
        mov     r0, r5
-#if !defined(CONFIG_THUMB2_KERNEL)
+#if !defined(CONFIG_THUMB2_KERNEL) && !defined(CONFIG_VMAP_STACK)
        set_current r7
        ldmia   r4, {r4 - sl, fp, sp, pc}       @ Load all regs saved previously
 #else
        mov     r1, r7
        ldmia   r4, {r4 - sl, fp, ip, lr}       @ Load all regs saved previously
+#ifdef CONFIG_VMAP_STACK
+       @
+       @ Do a dummy read from the new stack while running from the old one so
+       @ that we can rely on do_translation_fault() to fix up any stale PMD
+       @ entries covering the vmalloc region.
+       @
+       ldr     r2, [ip]
+#endif
        set_current r1
        mov     sp, ip
        ret     lr
@@ -841,6 +862,54 @@ ENTRY(__switch_to)
  UNWIND(.fnend         )
 ENDPROC(__switch_to)

+#ifdef CONFIG_VMAP_STACK
+       .text
+__bad_stack:
+       @
+       @ We detected an overflow in svc_entry, which switched to the
+       @ overflow stack. Stash the exception regs, and head to our overflow
+       @ handler. Entered with the orginal value of SP in IP, and the original
+       @ value of IP in TPIDRURW
+       @
+
+#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC)
+       mov     ip, ip                          @ mov expected by unwinder
+       push    {fp, ip, lr, pc}                @ GCC flavor frame record
+#else
+       str     ip, [sp, #-8]!                  @ store original SP
+       push    {fpreg, lr}                     @ Clang flavor frame record
+#endif
+UNWIND( ldr    ip, [r0, #4]    )               @ load exception LR
+UNWIND( str    ip, [sp, #12]   )               @ store in the frame record
+       mrc     p15, 0, ip, c13, c0, 2          @ reload IP
+
+       @ Store the original GPRs to the new stack.
+       svc_entry uaccess=0, overflow_check=0
+
+UNWIND( .save   {sp, pc}       )
+UNWIND( .save   {fpreg, lr}    )
+UNWIND( .setfp  fpreg, sp      )
+
+       ldr     fpreg, [sp, #S_SP]              @ Add our frame record
+                                               @ to the linked list
+#if defined(CONFIG_UNWINDER_FRAME_POINTER) && defined(CONFIG_CC_IS_GCC)
+       ldr     r1, [fp, #4]                    @ reload SP at entry
+       add     fp, fp, #12
+#else
+       ldr     r1, [fpreg, #8]
+#endif
+       str     r1, [sp, #S_SP]                 @ store in pt_regs
+
+       @ Stash the regs for handle_bad_stack
+       mov     r0, sp
+
+       @ Time to die
+       bl      handle_bad_stack
+       nop
+UNWIND( .fnend                 )
+ENDPROC(__bad_stack)
+#endif
+
        __INIT

 /*
diff --git a/arch/arm/kernel/entry-header.S b/arch/arm/kernel/entry-header.S
index ae24dd54e9ef..823dd1aa6e3e 100644
--- a/arch/arm/kernel/entry-header.S
+++ b/arch/arm/kernel/entry-header.S
@@ -423,3 +423,60 @@ scno       .req    r7              @ syscall number
 tbl    .req    r8              @ syscall table pointer
 why    .req    r8              @ Linux syscall (!= 0)
 tsk    .req    r9              @ current thread_info
+
+       .macro  do_overflow_check, frame_size:req
+#ifdef CONFIG_VMAP_STACK
+       @
+       @ Test whether the SP has overflowed. Task and IRQ stacks are aligned
+       @ so that SP & BIT(THREAD_SIZE_ORDER + PAGE_SHIFT) should always be
+       @ zero.
+       @
+ARM(   tst     sp, #1 << (THREAD_SIZE_ORDER + PAGE_SHIFT)      )
+THUMB( tst     r0, #1 << (THREAD_SIZE_ORDER + PAGE_SHIFT)      )
+THUMB( it      ne                                              )
+       bne     .Lstack_overflow_check\@
+
+       .pushsection    .text
+.Lstack_overflow_check\@:
+       @
+       @ Either we've just detected an overflow, or we've taken an exception
+       @ while on the overflow stack. We cannot use the stack until we have
+       @ decided which is the case. However, as we won't return to userspace,
+       @ we can clobber some USR/SYS mode registers to free up GPRs.
+       @
+
+       mcr     p15, 0, ip, c13, c0, 2          @ Stash IP in TPIDRURW
+       mrs     ip, cpsr
+       eor     ip, ip, #(SVC_MODE ^ SYSTEM_MODE)
+       msr     cpsr_c, ip                      @ Switch to SYS mode
+       eor     ip, ip, #(SVC_MODE ^ SYSTEM_MODE)
+       mov     sp, ip                          @ Stash mode in SP_usr
+
+       @ Load the overflow stack into IP using LR_usr as a scratch register
+       mov_l   lr, overflow_stack + OVERFLOW_STACK_SIZE
+       mrc     p15, 0, ip, c13, c0, 4          @ Get CPU offset
+       add     ip, ip, lr                      @ IP := this CPU's overflow stack
+       mov     lr, sp                          @ Unstash mode into LR_usr
+       msr     cpsr_c, lr                      @ Switch back to SVC mode
+
+       @
+       @ Check whether we are already on the overflow stack. This may happen,
+       @ e.g., when performing accesses that may fault when dumping the stack.
+       @ The overflow stack is not in the vmalloc space so we only need to
+       @ check whether the incoming SP is below the top of the overflow stack.
+       @
+ARM(   subs    ip, sp, ip              )       @ Delta with top of overflow stack
+THUMB( subs    ip, r0, ip              )
+       mrclo   p15, 0, ip, c13, c0, 2          @ Restore IP
+       blo     .Lout\@                         @ Carry on
+
+THUMB( sub     r0, sp, r0              )       @ Restore original R0
+THUMB( sub     sp, r0                  )       @ Restore original SP
+       sub     sp, sp, ip                      @ Switch to overflow stack
+       add     ip, sp, ip                      @ Keep incoming SP value in IP
+       add     ip, ip, #\frame_size            @ Undo svc_entry's SP change
+       b       __bad_stack
+       .popsection
+.Lout\@:
+#endif
+       .endm
diff --git a/arch/arm/kernel/irq.c b/arch/arm/kernel/irq.c
index bab3159b9074..25eacfd0f439 100644
--- a/arch/arm/kernel/irq.c
+++ b/arch/arm/kernel/irq.c
@@ -56,7 +56,14 @@ static void __init init_irq_stacks(void)
        int cpu;

        for_each_possible_cpu(cpu) {
-               stack = (u8 *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
+               if (!IS_ENABLED(CONFIG_VMAP_STACK))
+                       stack = (u8 *)__get_free_pages(GFP_KERNEL,
+                                                      THREAD_SIZE_ORDER);
+               else
+                       stack = __vmalloc_node(THREAD_SIZE, THREAD_ALIGN,
+                                              THREADINFO_GFP, NUMA_NO_NODE,
+                                              __builtin_return_address(0));
Thanks for the improved clang support in v3. The frame differences
LGTM. The only feedback I had was here, and a truly minor nit.

Avoid:

if (!foo)
  bar();
else
  baz();

Prefer:

if (foo)
  baz();
else
  bar();
quoted hunk ↗ jump to hunk
+
                if (WARN_ON(!stack))
                        break;
                per_cpu(irq_stack_ptr, cpu) = &stack[THREAD_SIZE];
diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c
index 1ba9fc2eb564..32fd3fee808d 100644
--- a/arch/arm/kernel/traps.c
+++ b/arch/arm/kernel/traps.c
@@ -121,7 +121,8 @@ void dump_backtrace_stm(u32 *stack, u32 instruction, const char *loglvl)
 static int verify_stack(unsigned long sp)
 {
        if (sp < PAGE_OFFSET ||
-           (sp > (unsigned long)high_memory && high_memory != NULL))
+           (!IS_ENABLED(CONFIG_VMAP_STACK) &&
+            sp > (unsigned long)high_memory && high_memory != NULL))
                return -EFAULT;

        return 0;
@@ -291,7 +292,8 @@ static int __die(const char *str, int err, struct pt_regs *regs)

        if (!user_mode(regs) || in_interrupt()) {
                dump_mem(KERN_EMERG, "Stack: ", regs->ARM_sp,
-                        ALIGN(regs->ARM_sp, THREAD_SIZE));
+                        ALIGN(regs->ARM_sp - THREAD_SIZE, THREAD_ALIGN)
+                        + THREAD_SIZE);
                dump_backtrace(regs, tsk, KERN_EMERG);
                dump_instr(KERN_EMERG, regs);
        }
@@ -838,3 +840,62 @@ void __init early_trap_init(void *vectors_base)
         */
 #endif
 }
+
+#ifdef CONFIG_VMAP_STACK
+
+DECLARE_PER_CPU(u8 *, irq_stack_ptr);
+
+asmlinkage DEFINE_PER_CPU_ALIGNED(u8[OVERFLOW_STACK_SIZE], overflow_stack);
+
+asmlinkage void handle_bad_stack(struct pt_regs *regs)
+{
+       unsigned long tsk_stk = (unsigned long)current->stack;
+       unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr);
+       unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack);
+
+       console_verbose();
+       pr_emerg("Insufficient stack space to handle exception!");
+
+       pr_emerg("Task stack:     [0x%08lx..0x%08lx]\n",
+                tsk_stk, tsk_stk + THREAD_SIZE);
+       pr_emerg("IRQ stack:      [0x%08lx..0x%08lx]\n",
+                irq_stk, irq_stk + THREAD_SIZE);
+       pr_emerg("Overflow stack: [0x%08lx..0x%08lx]\n",
+                ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE);
+
+       die("kernel stack overflow", regs, 0);
+}
+
+/*
+ * Normally, we rely on the logic in do_translation_fault() to update stale PMD
+ * entries covering the vmalloc space in a task's page tables when it first
+ * accesses the region in question. Unfortunately, this is not sufficient when
+ * the task stack resides in the vmalloc region, as do_translation_fault() is a
+ * C function that needs a stack to run.
+ *
+ * So we need to ensure that these PMD entries are up to date *before* the MM
+ * switch. As we already have some logic in the MM switch path that takes care
+ * of this, let's trigger it by bumping the counter every time the core vmalloc
+ * code modifies a PMD entry in the vmalloc region.
+ */
+void arch_sync_kernel_mappings(unsigned long start, unsigned long end)
+{
+       if (start > VMALLOC_END || end < VMALLOC_START)
+               return;
+
+       /*
+        * This hooks into the core vmalloc code to receive notifications of
+        * any PMD level changes that have been made to the kernel page tables.
+        * This means it should only be triggered once for every MiB worth of
+        * vmalloc space, given that we don't support huge vmalloc/vmap on ARM,
+        * and that kernel PMD level table entries are rarely (if ever)
+        * updated.
+        *
+        * This means that the counter is going to max out at ~250 for the
+        * typical case. If it overflows, something entirely unexpected has
+        * occurred so let's throw a warning if that happens.
+        */
+       WARN_ON(++init_mm.context.vmalloc_seq == UINT_MAX);
+}
+
+#endif
diff --git a/arch/arm/kernel/unwind.c b/arch/arm/kernel/unwind.c
index e8d729975f12..c5ea328c428d 100644
--- a/arch/arm/kernel/unwind.c
+++ b/arch/arm/kernel/unwind.c
@@ -389,7 +389,8 @@ int unwind_frame(struct stackframe *frame)

        /* store the highest address on the stack to avoid crossing it*/
        ctrl.sp_low = frame->sp;
-       ctrl.sp_high = ALIGN(ctrl.sp_low, THREAD_SIZE);
+       ctrl.sp_high = ALIGN(ctrl.sp_low - THREAD_SIZE, THREAD_ALIGN)
+                      + THREAD_SIZE;

        pr_debug("%s(pc = %08lx lr = %08lx sp = %08lx)\n", __func__,
                 frame->pc, frame->lr, frame->sp);
diff --git a/arch/arm/kernel/vmlinux.lds.S b/arch/arm/kernel/vmlinux.lds.S
index 20c4f6d20c7a..924cde084293 100644
--- a/arch/arm/kernel/vmlinux.lds.S
+++ b/arch/arm/kernel/vmlinux.lds.S
@@ -138,12 +138,12 @@ SECTIONS
 #ifdef CONFIG_STRICT_KERNEL_RWX
        . = ALIGN(1<<SECTION_SHIFT);
 #else
-       . = ALIGN(THREAD_SIZE);
+       . = ALIGN(THREAD_ALIGN);
 #endif
        __init_end = .;

        _sdata = .;
-       RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE)
+       RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN)
        _edata = .;

        BSS_SECTION(0, 0, 0)
--
2.30.2

-- 
Thanks,
~Nick Desaulniers

_______________________________________________
linux-arm-kernel mailing list
linux-arm-kernel@lists.infradead.org
http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help