Thread (32 messages) 32 messages, 3 authors, 2025-08-06

Re: [PATCH 4/5] sched: Add rt task enqueue/dequeue trace points

From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: 2025-08-01 09:56:45
Also in: lkml
Subsystem: scheduler, the rest · Maintainers: Ingo Molnar, Peter Zijlstra, Juri Lelli, Vincent Guittot, Linus Torvalds

Hello Nam,

On 8/1/2025 12:59 PM, Nam Cao wrote:
On Fri, Aug 01, 2025 at 09:12:08AM +0530, K Prateek Nayak wrote:
quoted
Just thinking out loud, putting this tracepoint here can lead to a
"dequeued -> dequeued" transition for fair task when they are in delayed
dequeue state.

    dequeue_task(p)
      trace_dequeue_task_tp(p) # First time
      dequeue_task_fair(p)
        p->se.delayed = 1
    ...
    <sched_switch> # p is still delayed
    ...
    sched_setscheduler(p)
      if (prev_class != next_class && p->se.sched_delayed)
        dequeue_task(p, DEQUEUE_DELAYED);
          trace_dequeue_task_tp(p) # Second time

It is not an issue as such but it might come as a surprise if users are
expecting a behavior like below which would be the case for !fair task
currently (and for all tasks before v6.12):

    digraph state_automaton {
        center = true;
        size = "7,11";
        {node [shape = plaintext, style=invis, label=""] "__init_enqueue_dequeue_cycle"};
        {node [shape = ellipse] "enqueued"};
        {node [shape = ellipse] "dequeued"};
        "__init_enqueue_dequeue_cycle" -> "enqueued";
        "__init_enqueue_dequeue_cycle" -> "dequeued";
        "enqueued" [label = "enqueued", color = green3];
        "enqueued" -> "dequeued" [ label = "dequeue_task" ];
        "dequeued" [label = "dequeued", color = red];
        "dequeued" -> "enqueued" [ label = "enqueue_task" ];
        { rank = min ;
            "__init_enqueue_dequeue_cycle";
            "dequeued";
            "enqueued";
        }
    }


Another:

   "dequeued" -> "dequeued" [ label = "dequeue_task" ];

edge would be needed in that case for >= v6.12. It is probably nothing
and can be easily handled by the users if they run into it but just
putting it out there for the record in case you only want to consider a
complete dequeue as "dequeued". Feel free to ignore since I'm completely
out of my depth when it comes to the usage of RV in the field :)
Ah, thanks for pointing this out. I do want to only consider complete
dequeue as "dequeued".

These tracepoints are not visible from userspace, and RV does not care
about enqueue/dequeue of fair tasks at the moment, so it is not a problem
for now. But as a precaution, I trust the below patch will do.
There are a few more cases with delayed dequeue:

1. A delayed task can be reenqueued before it is completely dequeued and
   will lead to a enqueue -> enqueue transition if we don't trace the
   first dequeue.

2. There are cases in set_user_nice() and __sched_setscheduler() where
   a delayed task is dequeued in delayed state and be put back in the
   cfs_rq in delayed state - an attempt to handle 1. can trip this.

Best I could think of is the below diff on top of your suggestion where
a "delayed -> reenqueue" is skipped but the case 2. is captures in case
one needs to inspect some properties from set_user_nice() /
__sched_setscheduler():

(only build tested on top of the diff you had pasted)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 9598984bee8d..1fc5a97bba6b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2071,7 +2071,8 @@ unsigned long get_wchan(struct task_struct *p)
 
 void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
-	trace_enqueue_task_tp(rq->cpu, p);
+	if (!p->se.sched_delayed || !move_entity(flags))
+		trace_enqueue_task_tp(rq->cpu, p);
 
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b173a059315c..1e2a636d6804 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5444,7 +5444,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	 * put back on, and if we advance min_vruntime, we'll be placed back
 	 * further than we started -- i.e. we'll be penalized.
 	 */
-	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
+	if (move_entity(flags))
 		update_min_vruntime(cfs_rq);
 
 	if (flags & DEQUEUE_DELAYED)
@@ -7054,6 +7054,7 @@ static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags)
 
 		/* Fix-up what dequeue_task_fair() skipped */
 		hrtick_update(rq);
+		trace_dequeue_task_tp(rq->cpu, p);
 
 		/*
 		 * Fix-up what block_task() skipped.
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index 7936d4333731..33897d35744a 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1196,19 +1196,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	dec_rt_group(rt_se, rt_rq);
 }
 
-/*
- * Change rt_se->run_list location unless SAVE && !MOVE
- *
- * assumes ENQUEUE/DEQUEUE flags match
- */
-static inline bool move_entity(unsigned int flags)
-{
-	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
-		return false;
-
-	return true;
-}
-
 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
 {
 	list_del_init(&rt_se->run_list);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index d3f33d10c58c..37730cd834ba 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2361,6 +2361,20 @@ extern const u32		sched_prio_to_wmult[40];
 
 #define RETRY_TASK		((void *)-1UL)
 
+/*
+ * Checks for a SAVE/RESTORE without MOVE. Returns false if
+ * SAVE and !MOVE.
+ *
+ * Assumes ENQUEUE/DEQUEUE flags match.
+ */
+static inline bool move_entity(unsigned int flags)
+{
+	if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+		return false;
+
+	return true;
+}
+
 struct affinity_context {
 	const struct cpumask	*new_mask;
 	struct cpumask		*user_mask;
---
Thoughts?

-- 
Thanks and Regards,
Prateek

P.S. move_entity() probably requires a better naming and perhaps can
even be simplified. I wrote out the below table just to convince myself
to reuse move_entity() in fair.c

    flags contains    (SAVE | MOVE)   (SAVE | MOVE)
                         == SAVE         != SAVE

    neiter                false           true
    SAVE                  true            false
    MOVE                  false           true
    SAVE | MOVE           false           true
quoted hunk ↗ jump to hunk
Nam
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index c38f12f7f903..b50668052f99 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -906,6 +906,14 @@ DECLARE_TRACE(dequeue_task_rt,
 	TP_PROTO(int cpu, struct task_struct *task),
 	TP_ARGS(cpu, task));
 
+DECLARE_TRACE(enqueue_task,
+	TP_PROTO(int cpu, struct task_struct *task),
+	TP_ARGS(cpu, task));
+
+DECLARE_TRACE(dequeue_task,
+	TP_PROTO(int cpu, struct task_struct *task),
+	TP_ARGS(cpu, task));
+
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b485e0639616..553c08a63395 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2077,6 +2077,8 @@ unsigned long get_wchan(struct task_struct *p)
 
 void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 {
+	trace_enqueue_task_tp(rq->cpu, p);
+
 	if (!(flags & ENQUEUE_NOCLOCK))
 		update_rq_clock(rq);
 
@@ -2119,7 +2121,11 @@ inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	 * and mark the task ->sched_delayed.
 	 */
 	uclamp_rq_dec(rq, p);
-	return p->sched_class->dequeue_task(rq, p, flags);
+	if (p->sched_class->dequeue_task(rq, p, flags)) {
+		trace_dequeue_task_tp(rq->cpu, p);
+		return true;
+	}
+	return false;
 }
 
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
  
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help