Re: [PATCH] x86/dumpstack/64: Add guard pages to stack_info
From: 王贇 <hidden>
Date: 2021-09-17 02:15:35
Also in:
bpf, linux-perf-users, lkml
On 2021/9/16 下午6:02, Peter Zijlstra wrote:
On Thu, Sep 16, 2021 at 10:03:19AM +0200, Peter Zijlstra wrote:quoted
Oh, I'm an idiot... yes it tries to read regs the stack, but clearly that won't work for the guard page.OK, extended it to also cover task and IRQ stacks. get_stack_info() doesn't seem to know about SOFTIRQ stacks on 64bit, might have to look into that next. Andy, what's the story with page_fault_oops(), according to the comment in exc_double_fault() actual stack overflows will always hit #DF.
Just give this one a test, still not working properly... [ 51.016033][ C0] traps: PANIC: double fault, error_code: 0x0 [ 51.016047][ C0] double fault: 0000 [#1] SMP PTI [ 51.016054][ C0] CPU: 0 PID: 761 Comm: a.out Not tainted 5.14.0-next-20210913+ #543 [ 51.016061][ C0] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 51.016065][ C0] RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70 [ 51.016079][ C0] Code: 48 03 43 28 48 8b 0c 24 bb 01 00 00 00 4c 29 f0 48 39 c8 48 0f 47 c1 49 89 45 08 e9 48 ff ff ff 66 2e 0f 1f 84 00 00 00 00 00 <55> 53 e8 09 20 f2 ff 48 c7 c2 20 4d 03 00 65 48 03 15 5a 3b d2 7e [ 51.016086][ C0] RSP: 0018:fffffe000000b000 EFLAGS: 00010046 [ 51.016093][ C0] RAX: 0000000080120008 RBX: fffffe000000b050 RCX: 0000000000000000 [ 51.016097][ C0] RDX: ffff888106c3c300 RSI: ffffffff81269031 RDI: 000000000000001c [ 51.016102][ C0] RBP: 000000000000001c R08: 0000000000000001 R09: 0000000000000000 [ 51.016106][ C0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 51.016109][ C0] R13: fffffe000000b044 R14: 0000000000000001 R15: 0000000000000001 [ 51.016113][ C0] FS: 00007f0cfd961740(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 [ 51.016120][ C0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 51.016124][ C0] CR2: fffffe000000aff8 CR3: 0000000105ecc001 CR4: 00000000003606f0 [ 51.016129][ C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 51.016132][ C0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 51.016136][ C0] Call Trace: [ 51.016139][ C0] <TASK> [ 51.016141][ C0] </TASK> [ 51.016144][ C0] Modules linked in: [ 51.042436][ C0] ---[ end trace 5c102ce76b073dcf ]--- [ 51.042440][ C0] RIP: 0010:perf_swevent_get_recursion_context+0x0/0x70 [ 51.042450][ C0] Code: 48 03 43 28 48 8b 0c 24 bb 01 00 00 00 4c 29 f0 48 39 c8 48 0f 47 c1 49 89 45 08 e9 48 ff ff ff 66 2e 0f 1f 84 00 00 00 00 00 <55> 53 e8 09 20 f2 ff 48 c7 c2 20 4d 03 00 65 48 03 15 5a 3b d2 7e [ 51.042457][ C0] RSP: 0018:fffffe000000b000 EFLAGS: 00010046 [ 51.042462][ C0] RAX: 0000000080120008 RBX: fffffe000000b050 RCX: 0000000000000000 [ 51.042466][ C0] RDX: ffff888106c3c300 RSI: ffffffff81269031 RDI: 000000000000001c [ 51.042470][ C0] RBP: 000000000000001c R08: 0000000000000001 R09: 0000000000000000 [ 51.042479][ C0] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 [ 51.042483][ C0] R13: fffffe000000b044 R14: 0000000000000001 R15: 0000000000000001 [ 51.042487][ C0] FS: 00007f0cfd961740(0000) GS:ffff88813bc00000(0000) knlGS:0000000000000000 [ 51.042493][ C0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 51.042497][ C0] CR2: fffffe000000aff8 CR3: 0000000105ecc001 CR4: 00000000003606f0 [ 51.042501][ C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 51.042505][ C0] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 51.042510][ C0] Kernel panic - not syncing: Fatal exception in interrupt [ 51.042917][ C0] Kernel Offset: disabled Regards, Michael Wang
quoted hunk ↗ jump to hunk
---diff --git a/arch/x86/include/asm/cpu_entry_area.h b/arch/x86/include/asm/cpu_entry_area.h index 3d52b094850a..c4e92462c2b4 100644 --- a/arch/x86/include/asm/cpu_entry_area.h +++ b/arch/x86/include/asm/cpu_entry_area.h@@ -61,6 +61,9 @@ enum exception_stack_ordering { #define CEA_ESTACK_OFFS(st) \ offsetof(struct cea_exception_stacks, st## _stack) +#define CEA_EGUARD_OFFS(st) \ + offsetof(struct cea_exception_stacks, st## _stack_guard) + #define CEA_ESTACK_PAGES \ (sizeof(struct cea_exception_stacks) / PAGE_SIZE)diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index f248eb2ac2d4..8ff346579330 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h@@ -14,13 +14,14 @@ #include <asm/switch_to.h> enum stack_type { - STACK_TYPE_UNKNOWN, + STACK_TYPE_UNKNOWN = 0, STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, + STACK_TYPE_GUARD = 0x80, }; struct stack_info {@@ -31,6 +32,15 @@ struct stack_info { bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); +static __always_inline bool in_stack_guard(void *addr, void *begin, void *end) +{ +#ifdef CONFIG_VMAP_STACK + if (addr > (begin - PAGE_SIZE)) + return true; +#endif + return false; +} + bool in_entry_stack(unsigned long *stack, struct stack_info *info); int get_stack_info(unsigned long *stack, struct task_struct *task,diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ea4fe192189d..91b406fe2a39 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c@@ -32,12 +32,19 @@ static struct pt_regs exec_summary_regs; bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info) { - unsigned long *begin = task_stack_page(task); - unsigned long *end = task_stack_page(task) + THREAD_SIZE; - - if (stack < begin || stack >= end) + void *begin = task_stack_page(task); + void *end = begin + THREAD_SIZE; + int type = STACK_TYPE_TASK; + + if ((void *)stack < begin || (void *)stack >= end) { + if (in_stack_guard(stack, begin, end)) { + type |= STACK_TYPE_GUARD; + goto fill_info; + } return false; + } +fill_info: info->type = STACK_TYPE_TASK; info->begin = begin; info->end = end;@@ -50,14 +57,20 @@ bool noinstr in_task_stack(unsigned long *stack, struct task_struct *task, bool noinstr in_entry_stack(unsigned long *stack, struct stack_info *info) { struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); - + int type = STACK_TYPE_ENTRY; void *begin = ss; void *end = ss + 1; - if ((void *)stack < begin || (void *)stack >= end) + if ((void *)stack < begin || (void *)stack >= end) { + if (in_stack_guard(stack, begin, end)) { + type |= STACK_TYPE_GUARD; + goto fill_info; + } return false; + } - info->type = STACK_TYPE_ENTRY; +fill_info: + info->type = type; info->begin = begin; info->end = end; info->next_sp = NULL;diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 5601b95944fa..3634bdf9ab36 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c@@ -32,9 +32,15 @@ const char *stack_type_name(enum stack_type type) { BUILD_BUG_ON(N_EXCEPTION_STACKS != 6); + if (type == STACK_TYPE_TASK) + return "TASK"; + if (type == STACK_TYPE_IRQ) return "IRQ"; + if (type == STACK_TYPE_SOFTIRQ) + return "SOFTIRQ"; + if (type == STACK_TYPE_ENTRY) { /* * On 64-bit, we have a generic entry stack that we@@ -63,6 +69,11 @@ struct estack_pages { }; #define EPAGERANGE(st) \ + [PFN_DOWN(CEA_EGUARD_OFFS(st))] = { \ + .offs = CEA_EGUARD_OFFS(st), \ + .size = PAGE_SIZE, \ + .type = STACK_TYPE_GUARD + \ + STACK_TYPE_EXCEPTION + ESTACK_ ##st, }, \ [PFN_DOWN(CEA_ESTACK_OFFS(st)) ... \ PFN_DOWN(CEA_ESTACK_OFFS(st) + CEA_ESTACK_SIZE(st) - 1)] = { \ .offs = CEA_ESTACK_OFFS(st), \@@ -111,7 +122,7 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac k = (stk - begin) >> PAGE_SHIFT; /* Lookup the page descriptor */ ep = &estack_pages[k]; - /* Guard page? */ + /* unknown entry */ if (!ep->size) return false;@@ -122,7 +133,12 @@ static __always_inline bool in_exception_stack(unsigned long *stack, struct stac info->type = ep->type; info->begin = (unsigned long *)begin; info->end = (unsigned long *)end; - info->next_sp = (unsigned long *)regs->sp; + info->next_sp = NULL; + + /* Can't read regs from a guard page. */ + if (!(ep->type & STACK_TYPE_GUARD)) + info->next_sp = (unsigned long *)regs->sp; + return true; }@@ -130,6 +146,7 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info { unsigned long *end = (unsigned long *)this_cpu_read(hardirq_stack_ptr); unsigned long *begin; + int type = STACK_TYPE_IRQ; /* * @end points directly to the top most stack entry to avoid a -8@@ -144,19 +161,27 @@ static __always_inline bool in_irq_stack(unsigned long *stack, struct stack_info * final operation is 'popq %rsp' which means after that RSP points * to the original stack and not to @end. */ - if (stack < begin || stack >= end) + if (stack < begin || stack >= end) { + if (in_stack_guard(stack, begin, end)) { + type |= STACK_TYPE_GUARD; + goto fill_info; + } return false; + } - info->type = STACK_TYPE_IRQ; +fill_info: + info->type = type; info->begin = begin; info->end = end; + info->next_sp = NULL; /* * The next stack pointer is stored at the top of the irq stack * before switching to the irq stack. Actual stack entries are all * below that. */ - info->next_sp = (unsigned long *)*(end - 1); + if (!(type & STACK_TYPE_GUARD)) + info->next_sp = (unsigned long *)*(end - 1); return true; }@@ -193,6 +218,9 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, if (!get_stack_info_noinstr(stack, task, info)) goto unknown; + if (info->type & STACK_TYPE_GUARD) + goto unknown; + /* * Make sure we don't iterate through any given stack more than once. * If it comes up a second time then there's something wrong going on:diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a58800973aed..80f6d8d735eb 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c@@ -353,6 +353,7 @@ DEFINE_IDTENTRY_DF(exc_double_fault) #ifdef CONFIG_VMAP_STACK unsigned long address = read_cr2(); + struct stack_info info; #endif #ifdef CONFIG_X86_ESPFIX64@@ -455,9 +456,11 @@ DEFINE_IDTENTRY_DF(exc_double_fault) * stack even if the actual trigger for the double fault was * something else. */ - if ((unsigned long)task_stack_page(tsk) - 1 - address < PAGE_SIZE) { - handle_stack_overflow("kernel stack overflow (double-fault)", - regs, address); + if (get_stack_info_noinstr((void *)address, current, &info) && + info.type & STACK_TYPE_GUARD) { + const char *name = stack_type_name(info.type & ~STACK_TYPE_GUARD); + pr_emerg("BUG: %s stack guard hit at %p (stack is %p..%p)\n", + name, (void *)address, info.begin, info.end); } #endif@@ -708,7 +711,9 @@ asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct pt_regs *r sp = regs->sp; stack = (unsigned long *)sp; - if (!get_stack_info_noinstr(stack, current, &info) || info.type == STACK_TYPE_ENTRY || + if (!get_stack_info_noinstr(stack, current, &info) || + info.type & STACK_TYPE_GUARD || + info.type == STACK_TYPE_ENTRY || info.type >= STACK_TYPE_EXCEPTION_LAST) sp = __this_cpu_ist_top_va(VC2);