[PATCH v17 02/16] preempt: Track NMI nesting to separate per-CPU counter
From: Lyude Paul <lyude@redhat.com>
Date: 2026-01-21 22:40:06
Also in:
lkml
Subsystem:
scheduler, the rest · Maintainers:
Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot, Linus Torvalds
From: Joel Fernandes <joelagnelf@nvidia.com> Move NMI nesting tracking from the preempt_count bits to a separate per-CPU counter (nmi_nesting). This is to free up the NMI bits in the preempt_count, allowing those bits to be repurposed for other uses. This also has the benefit of tracking more than 16-levels deep if there is ever a need. Reduce multiple bits in preempt_count for NMI tracking. Reduce NMI_BITS from 3 to 1, using it only to detect if we're in an NMI. Suggested-by: Boqun Feng <redacted> Signed-off-by: Joel Fernandes <redacted> Signed-off-by: Lyude Paul <lyude@redhat.com> --- include/linux/hardirq.h | 16 ++++++++++++---- include/linux/preempt.h | 13 +++++++++---- kernel/softirq.c | 2 ++ 3 files changed, 23 insertions(+), 8 deletions(-)
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index d57cab4d4c06f..cc06bda52c3e5 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h@@ -10,6 +10,8 @@ #include <linux/vtime.h> #include <asm/hardirq.h> +DECLARE_PER_CPU(unsigned int, nmi_nesting); + extern void synchronize_irq(unsigned int irq); extern bool synchronize_hardirq(unsigned int irq);
@@ -102,14 +104,16 @@ void irq_exit_rcu(void); */ /* - * nmi_enter() can nest up to 15 times; see NMI_BITS. + * nmi_enter() can nest - nesting is tracked in a per-CPU counter. */ #define __nmi_enter() \ do { \ lockdep_off(); \ arch_nmi_enter(); \ - BUG_ON(in_nmi() == NMI_MASK); \ - __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ + BUG_ON(__this_cpu_read(nmi_nesting) == UINT_MAX); \ + __this_cpu_inc(nmi_nesting); \ + __preempt_count_add(HARDIRQ_OFFSET); \ + preempt_count_set(preempt_count() | NMI_MASK); \ } while (0) #define nmi_enter() \
@@ -124,8 +128,12 @@ void irq_exit_rcu(void); #define __nmi_exit() \ do { \ + unsigned int nesting; \ BUG_ON(!in_nmi()); \ - __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ + __preempt_count_sub(HARDIRQ_OFFSET); \ + nesting = __this_cpu_dec_return(nmi_nesting); \ + if (!nesting) \ + __preempt_count_sub(NMI_OFFSET); \ arch_nmi_exit(); \ lockdep_on(); \ } while (0)
diff --git a/include/linux/preempt.h b/include/linux/preempt.h
index f07e7f37f3ca5..e2d3079d3f5f1 100644
--- a/include/linux/preempt.h
+++ b/include/linux/preempt.h@@ -18,6 +18,8 @@ * - bits 0-7 are the preemption count (max preemption depth: 256) * - bits 8-15 are the softirq count (max # of softirqs: 256) * - bits 16-23 are the hardirq disable count (max # of hardirq disable: 256) + * - bits 24-27 are the hardirq count (max # of hardirqs: 16) + * - bit 28 is the NMI flag (no nesting count, tracked separately) * * The hardirq count could in theory be the same as the number of * interrupts in the system, but we run all interrupt handlers with
@@ -25,18 +27,21 @@ * there are a few palaeontologic drivers which reenable interrupts in * the handler, so we need more than one bit here. * + * NMI nesting depth is tracked in a separate per-CPU variable + * (nmi_nesting) to save bits in preempt_count. + * * PREEMPT_MASK: 0x000000ff * SOFTIRQ_MASK: 0x0000ff00 * HARDIRQ_DISABLE_MASK: 0x00ff0000 - * HARDIRQ_MASK: 0x07000000 - * NMI_MASK: 0x38000000 + * HARDIRQ_MASK: 0x0f000000 + * NMI_MASK: 0x10000000 * PREEMPT_NEED_RESCHED: 0x80000000 */ #define PREEMPT_BITS 8 #define SOFTIRQ_BITS 8 #define HARDIRQ_DISABLE_BITS 8 -#define HARDIRQ_BITS 3 -#define NMI_BITS 3 +#define HARDIRQ_BITS 4 +#define NMI_BITS 1 #define PREEMPT_SHIFT 0 #define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 77198911b8dd4..af47ea23aba3b 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c@@ -88,6 +88,8 @@ EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); #endif +DEFINE_PER_CPU(unsigned int, nmi_nesting); + /* * SOFTIRQ_OFFSET usage: *
--
2.52.0