[PATCH v10 3/3] sched/fair: Allocate cfs_tg_state with percpu allocator
From: Zecheng Li <hidden>
Date: 2026-05-22 14:16:33
Subsystem:
scheduler, the rest · Maintainers:
Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot, Linus Torvalds
From: Zecheng Li <redacted> To remove the cfs_rq pointer array in task_group, allocate the combined cfs_rq and sched_entity using the per-cpu allocator. This patch implements the following: - Changes task_group->cfs_rq from struct cfs_rq ** to struct cfs_rq __percpu *. - Updates memory allocation in alloc_fair_sched_group() and free_fair_sched_group() to use alloc_percpu_gfp() and free_percpu() respectively. - Uses the inline accessor tg_cfs_rq(tg, cpu) with per_cpu_ptr() to retrieve the pointer to cfs_rq for the given task group and CPU. - Replaces direct accesses tg->cfs_rq[cpu] with calls to the new tg_cfs_rq(tg, cpu) helper. - Handles the root_task_group: since struct rq is already a per-cpu variable (runqueues), its embedded cfs_rq (rq->cfs) is also per-cpu. Therefore, we assign root_task_group.cfs_rq = &runqueues.cfs. - Cleans up the code that initializes the root task group. This change places each CPU's cfs_rq and sched_entity in its local per-cpu memory area to remove the per-task_group pointer arrays. Signed-off-by: Zecheng Li <redacted> Signed-off-by: Zecheng Li <redacted> Reviewed-by: K Prateek Nayak <kprateek.nayak@amd.com> Reviewed-by: Josh Don <redacted> --- kernel/sched/core.c | 35 +++++++++++----------------- kernel/sched/fair.c | 54 ++++++++++++++++++-------------------------- kernel/sched/sched.h | 14 ++++++++---- 3 files changed, 45 insertions(+), 58 deletions(-)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 86fbb38901aa..163930370cd0 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c@@ -8884,7 +8884,7 @@ static struct kmem_cache *task_group_cache __ro_after_init; void __init sched_init(void) { - unsigned long ptr = 0; + unsigned long __maybe_unused ptr = 0; int i; /* Make sure the linker didn't screw up */
@@ -8900,33 +8900,24 @@ void __init sched_init(void) wait_bit_init(); #ifdef CONFIG_FAIR_GROUP_SCHED - ptr += nr_cpu_ids * sizeof(void **); -#endif -#ifdef CONFIG_RT_GROUP_SCHED - ptr += 2 * nr_cpu_ids * sizeof(void **); -#endif - if (ptr) { - ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); + root_task_group.cfs_rq = &runqueues.cfs; -#ifdef CONFIG_FAIR_GROUP_SCHED - root_task_group.cfs_rq = (struct cfs_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); - - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); + root_task_group.shares = ROOT_TASK_GROUP_LOAD; + init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_EXT_GROUP_SCHED - scx_tg_init(&root_task_group); + scx_tg_init(&root_task_group); #endif /* CONFIG_EXT_GROUP_SCHED */ #ifdef CONFIG_RT_GROUP_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); + ptr += 2 * nr_cpu_ids * sizeof(void **); + ptr = (unsigned long)kzalloc(ptr, GFP_NOWAIT); + root_task_group.rt_se = (struct sched_rt_entity **)ptr; + ptr += nr_cpu_ids * sizeof(void **); - root_task_group.rt_rq = (struct rt_rq **)ptr; - ptr += nr_cpu_ids * sizeof(void **); + root_task_group.rt_rq = (struct rt_rq **)ptr; + ptr += nr_cpu_ids * sizeof(void **); #endif /* CONFIG_RT_GROUP_SCHED */ - } init_defrootdomain();
@@ -9841,7 +9832,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, } for_each_online_cpu(i) { - struct cfs_rq *cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, i); struct rq *rq = cfs_rq->rq; guard(rq_lock_irq)(rq);
@@ -10009,7 +10000,7 @@ static u64 throttled_time_self(struct task_group *tg) u64 total = 0; for_each_possible_cpu(i) { - total += READ_ONCE(tg->cfs_rq[i]->throttled_clock_self_time); + total += READ_ONCE(tg_cfs_rq(tg, i)->throttled_clock_self_time); } return total;
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index db23951d6ba0..9f5b3b3e738c 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c@@ -334,7 +334,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) * to a tree or when we reach the top of the tree */ if (cfs_rq->tg->parent && - cfs_rq->tg->parent->cfs_rq[cpu]->on_list) { + tg_cfs_rq(cfs_rq->tg->parent, cpu)->on_list) { /* * If parent is already on the list, we add the child * just before. Thanks to circular linked property of
@@ -342,7 +342,7 @@ static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq) * of the list that starts by parent. */ list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list, - &(cfs_rq->tg->parent->cfs_rq[cpu]->leaf_cfs_rq_list)); + &(tg_cfs_rq(cfs_rq->tg->parent, cpu)->leaf_cfs_rq_list)); /* * The branch is now connected to its tree so we can * reset tmp_alone_branch to the beginning of the
@@ -5008,7 +5008,7 @@ static void __maybe_unused clear_tg_offline_cfs_rqs(struct rq *rq) rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); clear_tg_load_avg(cfs_rq); }
@@ -6565,7 +6565,7 @@ static inline int throttled_hierarchy(struct cfs_rq *cfs_rq) static inline int lb_throttled_hierarchy(struct task_struct *p, int dst_cpu) { - return throttled_hierarchy(task_group(p)->cfs_rq[dst_cpu]); + return throttled_hierarchy(tg_cfs_rq(task_group(p), dst_cpu)); } static inline bool task_is_throttled(struct task_struct *p)
@@ -6711,7 +6711,7 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags); static int tg_unthrottle_up(struct task_group *tg, void *data) { struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); struct task_struct *p, *tmp; if (--cfs_rq->throttle_count)
@@ -6782,7 +6782,7 @@ static void record_throttle_clock(struct cfs_rq *cfs_rq) static int tg_throttle_down(struct task_group *tg, void *data) { struct rq *rq = data; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); if (cfs_rq->throttle_count++) return 0;
@@ -7256,8 +7256,8 @@ static void sync_throttle(struct task_group *tg, int cpu) if (!tg->parent) return; - cfs_rq = tg->cfs_rq[cpu]; - pcfs_rq = tg->parent->cfs_rq[cpu]; + cfs_rq = tg_cfs_rq(tg, cpu); + pcfs_rq = tg_cfs_rq(tg->parent, cpu); cfs_rq->throttle_count = pcfs_rq->throttle_count; cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
@@ -7449,7 +7449,7 @@ static void __maybe_unused update_runtime_enabled(struct rq *rq) rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth; - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); raw_spin_lock(&cfs_b->lock); cfs_rq->runtime_enabled = cfs_b->quota != RUNTIME_INF;
@@ -7478,7 +7478,7 @@ static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq) rcu_read_lock(); list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu_of(rq)); if (!cfs_rq->runtime_enabled) continue;
@@ -10403,7 +10403,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ struct cfs_rq *dst_cfs_rq; #ifdef CONFIG_FAIR_GROUP_SCHED - dst_cfs_rq = task_group(p)->cfs_rq[dest_cpu]; + dst_cfs_rq = tg_cfs_rq(task_group(p), dest_cpu); #else dst_cfs_rq = &cpu_rq(dest_cpu)->cfs; #endif
@@ -14833,7 +14833,7 @@ static int task_is_throttled_fair(struct task_struct *p, int cpu) struct cfs_rq *cfs_rq; #ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq = task_group(p)->cfs_rq[cpu]; + cfs_rq = tg_cfs_rq(task_group(p), cpu); #else cfs_rq = &cpu_rq(cpu)->cfs; #endif
@@ -15098,39 +15098,31 @@ static void task_change_group_fair(struct task_struct *p) void free_fair_sched_group(struct task_group *tg) { - int i; - - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - } - - kfree(tg->cfs_rq); + free_percpu(tg->cfs_rq); } int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) { - struct cfs_tg_state *state; + struct cfs_tg_state __percpu *state; struct sched_entity *se; struct cfs_rq *cfs_rq; int i; - tg->cfs_rq = kzalloc_objs(cfs_rq, nr_cpu_ids); - if (!tg->cfs_rq) + state = alloc_percpu_gfp(struct cfs_tg_state, GFP_KERNEL); + if (!state) goto err; + tg->cfs_rq = &state->cfs_rq; tg->shares = NICE_0_LOAD; init_cfs_bandwidth(tg_cfs_bandwidth(tg), tg_cfs_bandwidth(parent)); for_each_possible_cpu(i) { - state = kzalloc_node(sizeof(*state), - GFP_KERNEL, cpu_to_node(i)); - if (!state) + cfs_rq = tg_cfs_rq(tg, i); + if (!cfs_rq) goto err; - cfs_rq = &state->cfs_rq; - se = &state->se; + se = tg_se(tg, i); init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, tg_se(parent, i)); init_entity_runnable_average(se);
@@ -15167,7 +15159,7 @@ void unregister_fair_sched_group(struct task_group *tg) destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); for_each_possible_cpu(cpu) { - struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; + struct cfs_rq *cfs_rq = tg_cfs_rq(tg, cpu); struct sched_entity *se = tg_se(tg, cpu); struct rq *rq = cpu_rq(cpu);
@@ -15204,8 +15196,6 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, cfs_rq->rq = rq; init_cfs_rq_runtime(cfs_rq); - tg->cfs_rq[cpu] = cfs_rq; - /* se could be NULL for root_task_group */ if (!se) return;
@@ -15298,7 +15288,7 @@ int sched_group_set_idle(struct task_group *tg, long idle) for_each_possible_cpu(i) { struct rq *rq = cpu_rq(i); struct sched_entity *se = tg_se(tg, i); - struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i]; + struct cfs_rq *grp_cfs_rq = tg_cfs_rq(tg, i); bool was_idle = cfs_rq_is_idle(grp_cfs_rq); long idle_task_delta; struct rq_flags rf;
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 63574f9d57f1..95a22baa172b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h@@ -481,7 +481,7 @@ struct task_group { #ifdef CONFIG_FAIR_GROUP_SCHED /* runqueue "owned" by this group on each CPU */ - struct cfs_rq **cfs_rq; + struct cfs_rq __percpu *cfs_rq; unsigned long shares; /* * load_avg can be heavily contended at clock tick time, so put
@@ -2299,6 +2299,12 @@ struct cfs_tg_state { struct sched_statistics stats; } __no_randomize_layout; +/* Access a specific CPU's cfs_rq from a task group */ +static inline struct cfs_rq *tg_cfs_rq(struct task_group *tg, int cpu) +{ + return per_cpu_ptr(tg->cfs_rq, cpu); +} + static inline struct sched_entity *tg_se(struct task_group *tg, int cpu) { struct cfs_tg_state *state;
@@ -2306,7 +2312,7 @@ static inline struct sched_entity *tg_se(struct task_group *tg, int cpu) if (is_root_task_group(tg)) return NULL; - state = container_of(tg->cfs_rq[cpu], struct cfs_tg_state, cfs_rq); + state = container_of(tg_cfs_rq(tg, cpu), struct cfs_tg_state, cfs_rq); return &state->se; }
@@ -2330,8 +2336,8 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED - set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); - p->se.cfs_rq = tg->cfs_rq[cpu]; + set_task_rq_fair(&p->se, p->se.cfs_rq, tg_cfs_rq(tg, cpu)); + p->se.cfs_rq = tg_cfs_rq(tg, cpu); p->se.parent = tg_se(tg, cpu); p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0; #endif
--
2.54.0