Re: [PATCH v2 3/4] sched/isolation: Add HK_TYPE_WQ to isolcpus=domain
From: Frederic Weisbecker <frederic@kernel.org>
Date: 2022-10-14 13:24:28
Also in:
linux-crypto, linux-pci, lkml
Subsystem:
cpu hotplug, kernel virtual machine for x86 (kvm/x86), networking [general], pci subsystem, read-copy update (rcu), scheduler, the rest, workqueue, x86 architecture (32-bit and 64-bit) · Maintainers:
Thomas Gleixner, Peter Zijlstra, Sean Christopherson, Paolo Bonzini, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Bjorn Helgaas, "Paul E. McKenney", Frederic Weisbecker, Neeraj Upadhyay, Joel Fernandes, Josh Triplett, Boqun Feng, Uladzislau Rezki, Ingo Molnar, Juri Lelli, Vincent Guittot, Linus Torvalds, Tejun Heo, Borislav Petkov, Dave Hansen
On Fri, Oct 14, 2022 at 10:36:19AM +0200, Peter Zijlstra wrote:
+ Frederic; who actually does most of this code On Thu, Oct 13, 2022 at 03:40:28PM -0300, Leonardo Bras wrote:quoted
Housekeeping code keeps multiple cpumasks in order to keep track of which cpus can perform given housekeeping category. Every time the HK_TYPE_WQ cpumask is checked before queueing work at a cpu WQ it also happens to check for HK_TYPE_DOMAIN. So It can be assumed that the Domain isolation also ends up isolating work queues. Delegating current HK_TYPE_DOMAIN's work queue isolation to HK_TYPE_WQ makes it simpler to check if a cpu can run a task into an work queue, since code just need to go through a single HK_TYPE_* cpumask. Make isolcpus=domain aggregate both HK_TYPE_DOMAIN and HK_TYPE_WQ, and remove a lot of cpumask_and calls. Also, remove a unnecessary '|=' at housekeeping_isolcpus_setup() since we are sure that 'flags == 0' here. Signed-off-by: Leonardo Bras <redacted>I've long maintained that having all these separate masks is daft; Frederic do we really need that?
Indeed. In my queue for the cpuset interface to nohz_full, I have the following patch (but note DOMAIN and WQ have to stay seperate flags because workqueue affinity can be modified seperately from isolcpus) --- From: Frederic Weisbecker <frederic@kernel.org> Date: Tue, 26 Jul 2022 17:03:30 +0200 Subject: [PATCH] sched/isolation: Gather nohz_full related isolation features into common flag Signed-off-by: Frederic Weisbecker <frederic@kernel.org> --- arch/x86/kvm/x86.c | 2 +- drivers/pci/pci-driver.c | 2 +- include/linux/sched/isolation.h | 7 +------ kernel/cpu.c | 4 ++-- kernel/kthread.c | 4 ++-- kernel/rcu/tasks.h | 2 +- kernel/rcu/tree_plugin.h | 6 +++--- kernel/sched/core.c | 10 +++++----- kernel/sched/fair.c | 6 +++--- kernel/sched/isolation.c | 25 +++++++------------------ kernel/watchdog.c | 2 +- kernel/workqueue.c | 2 +- net/core/net-sysfs.c | 2 +- 13 files changed, 29 insertions(+), 45 deletions(-)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 1910e1e78b15..d0b73fcf4a1c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c@@ -9009,7 +9009,7 @@ int kvm_arch_init(void *opaque) } if (pi_inject_timer == -1) - pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER); + pi_inject_timer = housekeeping_enabled(HK_TYPE_NOHZ_FULL); #ifdef CONFIG_X86_64 pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c
index 49238ddd39ee..af3494a39921 100644
--- a/drivers/pci/pci-driver.c
+++ b/drivers/pci/pci-driver.c@@ -378,7 +378,7 @@ static int pci_call_probe(struct pci_driver *drv, struct pci_dev *dev, goto out; } cpumask_and(wq_domain_mask, - housekeeping_cpumask(HK_TYPE_WQ), + housekeeping_cpumask(HK_TYPE_NOHZ_FULL), housekeeping_cpumask(HK_TYPE_DOMAIN)); cpu = cpumask_any_and(cpumask_of_node(node),
diff --git a/include/linux/sched/isolation.h b/include/linux/sched/isolation.h
index 8c15abd67aed..7ca34e04abe7 100644
--- a/include/linux/sched/isolation.h
+++ b/include/linux/sched/isolation.h@@ -6,15 +6,10 @@ #include <linux/tick.h> enum hk_type { - HK_TYPE_TIMER, - HK_TYPE_RCU, - HK_TYPE_MISC, + HK_TYPE_NOHZ_FULL, HK_TYPE_SCHED, - HK_TYPE_TICK, HK_TYPE_DOMAIN, - HK_TYPE_WQ, HK_TYPE_MANAGED_IRQ, - HK_TYPE_KTHREAD, HK_TYPE_MAX };
diff --git a/kernel/cpu.c b/kernel/cpu.c
index bbad5e375d3b..573f14d75a2e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c@@ -1500,8 +1500,8 @@ int freeze_secondary_cpus(int primary) cpu_maps_update_begin(); if (primary == -1) { primary = cpumask_first(cpu_online_mask); - if (!housekeeping_cpu(primary, HK_TYPE_TIMER)) - primary = housekeeping_any_cpu(HK_TYPE_TIMER); + if (!housekeeping_cpu(primary, HK_TYPE_NOHZ_FULL)) + primary = housekeeping_any_cpu(HK_TYPE_NOHZ_FULL); } else { if (!cpu_online(primary)) primary = cpumask_first(cpu_online_mask);
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 544fd4097406..0719035feba0 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c@@ -355,7 +355,7 @@ static int kthread(void *_create) * back to default in case they have been changed. */ sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_KTHREAD)); + set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_TYPE_NOHZ_FULL)); /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE);
@@ -721,7 +721,7 @@ int kthreadd(void *unused) /* Setup a clean context for our children to inherit. */ set_task_comm(tsk, "kthreadd"); ignore_signals(tsk); - set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_KTHREAD)); + set_cpus_allowed_ptr(tsk, housekeeping_cpumask(HK_TYPE_NOHZ_FULL)); set_mems_allowed(node_states[N_MEMORY]); current->flags |= PF_NOFREEZE;
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index f5bf6fb430da..b99f79625b26 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h@@ -537,7 +537,7 @@ static int __noreturn rcu_tasks_kthread(void *arg) struct rcu_tasks *rtp = arg; /* Run on housekeeping CPUs by default. Sysadm can move if desired. */ - housekeeping_affine(current, HK_TYPE_RCU); + housekeeping_affine(current, HK_TYPE_NOHZ_FULL); WRITE_ONCE(rtp->kthread_ptr, current); // Let GPs start! /*
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index b2219577fbe2..4935b06c3caf 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h@@ -1237,9 +1237,9 @@ static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) if ((mask & leaf_node_cpu_bit(rnp, cpu)) && cpu != outgoingcpu) cpumask_set_cpu(cpu, cm); - cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_RCU)); + cpumask_and(cm, cm, housekeeping_cpumask(HK_TYPE_NOHZ_FULL)); if (cpumask_empty(cm)) - cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_RCU)); + cpumask_copy(cm, housekeeping_cpumask(HK_TYPE_NOHZ_FULL)); set_cpus_allowed_ptr(t, cm); mutex_unlock(&rnp->boost_kthread_mutex); free_cpumask_var(cm);
@@ -1294,5 +1294,5 @@ static void rcu_bind_gp_kthread(void) { if (!tick_nohz_full_enabled()) return; - housekeeping_affine(current, HK_TYPE_RCU); + housekeeping_affine(current, HK_TYPE_NOHZ_FULL); }
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f53c0096860b..5ff205f39197 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c@@ -1079,13 +1079,13 @@ int get_nohz_timer_target(void) struct sched_domain *sd; const struct cpumask *hk_mask; - if (housekeeping_cpu(cpu, HK_TYPE_TIMER)) { + if (housekeeping_cpu(cpu, HK_TYPE_NOHZ_FULL)) { if (!idle_cpu(cpu)) return cpu; default_cpu = cpu; } - hk_mask = housekeeping_cpumask(HK_TYPE_TIMER); + hk_mask = housekeeping_cpumask(HK_TYPE_NOHZ_FULL); rcu_read_lock(); for_each_domain(cpu, sd) {
@@ -1101,7 +1101,7 @@ int get_nohz_timer_target(void) } if (default_cpu == -1) - default_cpu = housekeeping_any_cpu(HK_TYPE_TIMER); + default_cpu = housekeeping_any_cpu(HK_TYPE_NOHZ_FULL); cpu = default_cpu; unlock: rcu_read_unlock();
@@ -5562,7 +5562,7 @@ static void sched_tick_start(int cpu) int os; struct tick_work *twork; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_NOHZ_FULL)) return; WARN_ON_ONCE(!tick_work_cpu);
@@ -5583,7 +5583,7 @@ static void sched_tick_stop(int cpu) struct tick_work *twork; int os; - if (housekeeping_cpu(cpu, HK_TYPE_TICK)) + if (housekeeping_cpu(cpu, HK_TYPE_NOHZ_FULL)) return; WARN_ON_ONCE(!tick_work_cpu);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77b2048a9326..ac3b33e00451 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c@@ -10375,7 +10375,7 @@ static inline int on_null_domain(struct rq *rq) * - When one of the busy CPUs notice that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set + * - HK_TYPE_NOHZ_FULL CPUs are used for this task, because HK_TYPE_SCHED not set * anywhere yet. */
@@ -10384,7 +10384,7 @@ static inline int find_new_ilb(void) int ilb; const struct cpumask *hk_mask; - hk_mask = housekeeping_cpumask(HK_TYPE_MISC); + hk_mask = housekeeping_cpumask(HK_TYPE_NOHZ_FULL); for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
@@ -10400,7 +10400,7 @@ static inline int find_new_ilb(void) /* * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). + * idle CPU in the HK_TYPE_NOHZ_FULL housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) {
diff --git a/kernel/sched/isolation.c b/kernel/sched/isolation.c
index 4087718ee5b4..443f1ce83e32 100644
--- a/kernel/sched/isolation.c
+++ b/kernel/sched/isolation.c@@ -4,20 +4,15 @@ * any CPU: unbound workqueues, timers, kthreads and any offloadable work. * * Copyright (C) 2017 Red Hat, Inc., Frederic Weisbecker - * Copyright (C) 2017-2018 SUSE, Frederic Weisbecker + * Copyright (C) 2017-2022 SUSE, Frederic Weisbecker * */ enum hk_flags { - HK_FLAG_TIMER = BIT(HK_TYPE_TIMER), - HK_FLAG_RCU = BIT(HK_TYPE_RCU), - HK_FLAG_MISC = BIT(HK_TYPE_MISC), + HK_FLAG_NOHZ_FULL = BIT(HK_TYPE_NOHZ_FULL), HK_FLAG_SCHED = BIT(HK_TYPE_SCHED), - HK_FLAG_TICK = BIT(HK_TYPE_TICK), HK_FLAG_DOMAIN = BIT(HK_TYPE_DOMAIN), - HK_FLAG_WQ = BIT(HK_TYPE_WQ), HK_FLAG_MANAGED_IRQ = BIT(HK_TYPE_MANAGED_IRQ), - HK_FLAG_KTHREAD = BIT(HK_TYPE_KTHREAD), }; DEFINE_STATIC_KEY_FALSE(housekeeping_overridden);
@@ -88,7 +83,7 @@ void __init housekeeping_init(void) static_branch_enable(&housekeeping_overridden); - if (housekeeping.flags & HK_FLAG_TICK) + if (housekeeping.flags & HK_FLAG_NOHZ_FULL) sched_tick_offload_init(); for_each_set_bit(type, &housekeeping.flags, HK_TYPE_MAX) {
@@ -111,7 +106,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) cpumask_var_t non_housekeeping_mask, housekeeping_staging; int err = 0; - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) { + if ((flags & HK_FLAG_NOHZ_FULL) && !(housekeeping.flags & HK_FLAG_NOHZ_FULL)) { if (!IS_ENABLED(CONFIG_NO_HZ_FULL)) { pr_warn("Housekeeping: nohz unsupported." " Build with CONFIG_NO_HZ_FULL\n");
@@ -163,7 +158,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) housekeeping_setup_type(type, housekeeping_staging); } - if ((flags & HK_FLAG_TICK) && !(housekeeping.flags & HK_FLAG_TICK)) + if ((flags & HK_FLAG_NOHZ_FULL) && !(housekeeping.flags & HK_FLAG_NOHZ_FULL)) tick_nohz_full_setup(non_housekeeping_mask); housekeeping.flags |= flags;
@@ -179,12 +174,7 @@ static int __init housekeeping_setup(char *str, unsigned long flags) static int __init housekeeping_nohz_full_setup(char *str) { - unsigned long flags; - - flags = HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | HK_FLAG_RCU | - HK_FLAG_MISC | HK_FLAG_KTHREAD; - - return housekeeping_setup(str, flags); + return housekeeping_setup(str, HK_FLAG_NOHZ_FULL); } __setup("nohz_full=", housekeeping_nohz_full_setup);
@@ -198,8 +188,7 @@ static int __init housekeeping_isolcpus_setup(char *str) while (isalpha(*str)) { if (!strncmp(str, "nohz,", 5)) { str += 5; - flags |= HK_FLAG_TICK | HK_FLAG_WQ | HK_FLAG_TIMER | - HK_FLAG_RCU | HK_FLAG_MISC | HK_FLAG_KTHREAD; + flags |= HK_FLAG_NOHZ_FULL; continue; }
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 20a7a55e62b6..3e9636f4bac6 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c@@ -852,7 +852,7 @@ void __init lockup_detector_init(void) pr_info("Disabling watchdog on nohz_full cores by default\n"); cpumask_copy(&watchdog_cpumask, - housekeeping_cpumask(HK_TYPE_TIMER)); + housekeeping_cpumask(HK_TYPE_NOHZ_FULL)); if (!watchdog_nmi_probe()) nmi_watchdog_available = true;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1ea50f6be843..3eb283d76d81 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c@@ -5993,7 +5993,7 @@ void __init workqueue_init_early(void) BUILD_BUG_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); BUG_ON(!alloc_cpumask_var(&wq_unbound_cpumask, GFP_KERNEL)); - cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_WQ)); + cpumask_copy(wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_NOHZ_FULL)); cpumask_and(wq_unbound_cpumask, wq_unbound_cpumask, housekeeping_cpumask(HK_TYPE_DOMAIN)); pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index e319e242dddf..6dddf359b754 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c@@ -852,7 +852,7 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue, if (!cpumask_empty(mask)) { cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_DOMAIN)); - cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_WQ)); + cpumask_and(mask, mask, housekeeping_cpumask(HK_TYPE_NOHZ_FULL)); if (cpumask_empty(mask)) { free_cpumask_var(mask); return -EINVAL;
--
2.25.1