Re: [RFC PATCH 1/1] psi: Introduce in-kernel PSI auto monitor feature

From: K Prateek Nayak <kprateek.nayak@amd.com>
Date: 2026-07-02 19:51:27
Also in: lkml

Hello Pintu,

On 7/2/2026 10:46 PM, Pintu Kumar Agarwal wrote:

quoted hunk ↗ jump to hunk

diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index e2cf3b08d4e9..30e9800ce947 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c

@@ -104,3 +104,7 @@
 #ifdef CONFIG_SCHED_AUTOGROUP
 # include "autogroup.c"
 #endif
+
+#ifdef CONFIG_PSI_AUTO_MONITOR
+# include "psi_monitor.c"
+#endif

Isn't this a module? Why is this being included as a scheduler file?
Based on a quick glance, nothing in this module needs scheduler-internal
APIs (nor should it), so tools/sched/ would probably be a better
place to put it if there is interest in this feature.

quoted hunk ↗ jump to hunk

diff --git a/kernel/sched/psi_monitor.c b/kernel/sched/psi_monitor.c
new file mode 100644
index 000000000000..e929a0c05494
--- /dev/null
+++ b/kernel/sched/psi_monitor.c

@@ -0,0 +1,307 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * PSI Automatic Monitor with Weighted Task Ranking + Tracepoints
+ *
+ * Periodically samples system PSI (CPU, memory, IO) and, when any
+ * configured threshold is exceeded, ranks tasks using a composite
+ * score based on RSS, I/O activity and CPU time, then logs the
+ * top-N tasks via printk and a tracepoint.
+ *
+ * Sysfs interface:
+ *   /sys/kernel/psi_monitor/cpu_thresh                 (percentage)
+ *   /sys/kernel/psi_monitor/mem_thresh                 (percentage)
+ *   /sys/kernel/psi_monitor/io_thresh          (percentage)
+ *   /sys/kernel/psi_monitor/monitor_interval_ms (milliseconds)
+ *   /sys/kernel/psi_monitor/rss_weight
+ *   /sys/kernel/psi_monitor/io_weight
+ *   /sys/kernel/psi_monitor/cpu_weight
+ *
+ * Author: Pintu Kumar Agarwal
+ */
+
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/sched/signal.h>
+#include <linux/sched/loadavg.h>
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/workqueue.h>
+#include <linux/psi_types.h>
+#include <linux/kobject.h>
+#include <linux/sort.h>
+#include <linux/jiffies.h>
+#include <linux/time64.h>
+#include <linux/sched/cputime.h>
+
+/* Create tracepoints defined in include/trace/events/psi_monitor.h */
+#define CREATE_TRACE_POINTS
+#include <linux/psi.h>
+#include <trace/events/psi_monitor.h>
+
+
+/* Sysfs tunables */
+static unsigned int cpu_thresh = 80;     /* in percent */
+static unsigned int mem_thresh = 80;     /* in percent */
+static unsigned int io_thresh  = 80;     /* in percent */
+static unsigned int monitor_interval_ms = 10000;
+
+/* scoring weights */
+static unsigned int rss_weight = 2;
+static unsigned int io_weight  = 1;
+static unsigned int cpu_weight = 5;

Insanely configurable, but what makes it easy for developers to know
the right configuration under severe pressure, as you put it?

+
+static struct delayed_work psi_work;
+static struct kobject *psi_kobj;
+
+#define TOP_N 20
+
+struct task_info {
+       struct task_struct *task;
+       unsigned long rss;      /* pages */
+       unsigned long io_kb;    /* kB */
+       unsigned long cpu_ms;   /* ms */

Isn't the suffix self-explanatory? Do you really need the comments?

+       u64 score;
+};
+
+/*
+ * psi_avg10_percent() - derive a rough integer percentage from avg10
+ * for a given PSI state (e.g. PSI_CPU_SOME, PSI_MEM_SOME, PSI_IO_SOME).
+ *
+ * psi_group.avg[state][0] is the avg10 window in fixed-point notation.
+ * The conversion here is approximate but monotonic, which is sufficient
+ * for thresholding and ranking in this internal monitor.
+ */
+static unsigned long psi_avg10_percent(int state)
+{
+       u64 avg10;
+
+       if (state < 0 || state >= NR_PSI_STATES)
+               return 0;
+
+       avg10 = READ_ONCE(psi_system.avg[state][0]);
+       if (!avg10)
+               return 0;
+
+       /* Convert back from loadavg-style fixed-point to an approximate % */
+       /* Just consider the integer value and ignore fraction */

Why two single-line comments?

+       return LOAD_INT(avg10);
+}
+
+static int compare_score_desc(const void *a, const void *b)
+{
+       const struct task_info *ta = a;
+       const struct task_info *tb = b;
+
+       if (tb->score > ta->score)
+               return 1;
+       if (tb->score < ta->score)
+               return -1;
+       return 0;
+}
+
+static void log_top_tasks(void)
+{
+       struct task_info tasks[TOP_N];
+       struct task_struct *p, *t;
+       int count = 0;
+       int i;
+
+       rcu_read_lock();
+       for_each_process_thread(p, t) {

That's a ton of work every 10s.

+               struct mm_struct *mm;
+               unsigned long rss = 0;
+               unsigned long io_kb = 0;
+               unsigned long cpu_ms = 0;
+               u64 score;
+
+               /* Ignore tasks that are not on run queue or idle */
+               if (!t->on_rq && !is_idle_task(t))

Condition doesn't match the comment. Tasks off rq that aren't idle will
still go through.

+                       continue;
+
+               mm = get_task_mm(t);
+
+               /* mm could be NULL for kernel threads */
+               if (mm) {
+                       rss = mm ? get_mm_rss(mm) : 0;
+                       mmput_async(mm);
+               }
+
+               /*
+                * Approximate I/O activity: sum of read + write bytes.
+                * This uses the task_io_accounting fields in task_struct.
+                * Values are best-effort and need not be perfectly accurate
+                * for our ranking purpose.
+                */
+               io_kb = (t->ioac.read_bytes + t->ioac.write_bytes) >> 10;
+
+               /*
+                * Approximate CPU usage via task_sched_runtime(), converted
+                * to milliseconds. This is cumulative since task start, but
+                * is still useful for comparing hotspots at a given point.
+                */
+               cpu_ms = (unsigned long)(task_sched_runtime(t) / NSEC_PER_MSEC);
+
+               score = (u64)rss_weight * (u64)rss +
+                       (u64)io_weight  * (u64)io_kb +
+                       (u64)cpu_weight * (u64)cpu_ms;
+
+               if (count < TOP_N) {
+                       tasks[count].task   = t;
+                       tasks[count].rss    = rss;
+                       tasks[count].io_kb  = io_kb;
+                       tasks[count].cpu_ms = cpu_ms;
+                       tasks[count].score  = score;
+                       count++;
+               } else {
+                       /* Maintain a simple streaming top-N: replace smallest */
+                       int min_idx = 0;
+                       int j;
+
+                       for (j = 1; j < TOP_N; j++) {
+                               if (tasks[j].score < tasks[min_idx].score)
+                                       min_idx = j;
+                       }

Can't you just cache min_idx and recompute it only when it changes,
instead of doing an O(20) iteration for every task?

+
+                       if (score > tasks[min_idx].score) {
+                               tasks[min_idx].task   = t;
+                               tasks[min_idx].rss    = rss;
+                               tasks[min_idx].io_kb  = io_kb;
+                               tasks[min_idx].cpu_ms = cpu_ms;
+                               tasks[min_idx].score  = score;
+                       }
+               }
+       }
+       rcu_read_unlock();
+
+       sort(tasks, count, sizeof(struct task_info), compare_score_desc, NULL);
+
+       pr_info("psi_monitor: logging top %d tasks under pressure:\n", count);
+
+       for (i = 0; i < count; i++) {
+               struct task_struct *ts = tasks[i].task;
+               unsigned long rss_kb = tasks[i].rss << (PAGE_SHIFT - 10);
+               char name[128] = {0,};
+
+               if (ts->flags & PF_WQ_WORKER)
+                       wq_worker_comm(name, sizeof(name), ts);
+               else
+                       scnprintf(name, sizeof(name) - 1, ts->comm);
+
+               trace_psi_monitor_top_task(ts->pid, name,
+                               tasks[i].cpu_ms,
+                               rss_kb,
+                               tasks[i].io_kb,
+                               tasks[i].score);
+
+               pr_info("psi_monitor: pid=%d comm=%s psi_flag=%d oncpu=%d cputime(ms)=%lu rss(kB)=%lu io(kB)=%lu score=%llu\n",
+                       ts->pid, name, ts->psi_flags, task_cpu(ts),
+                       tasks[i].cpu_ms, rss_kb, tasks[i].io_kb,
+                       (unsigned long long)tasks[i].score);

This will unnecessarily dump to dmesg even if you have the trace event
enabled. Why?

+               }
+}
+
+static void psi_monitor_fn(struct work_struct *work)
+{
+       unsigned long cpu_pct, mem_pct, io_pct;
+       bool trigger = false;
+
+       cpu_pct = psi_avg10_percent(PSI_CPU_SOME);
+       mem_pct = psi_avg10_percent(PSI_MEM_SOME);
+       io_pct  = psi_avg10_percent(PSI_IO_SOME);
+
+       if (cpu_pct >= cpu_thresh || mem_pct >= mem_thresh ||
+               io_pct >= io_thresh)
+               trigger = true;
+
+       if (trigger) {
+               pr_info("psi_monitor: pressure high: cpu=%lu%% mem=%lu%% io=%lu%% (thresh cpu=%u mem=%u io=%u)\n",
+                       cpu_pct, mem_pct, io_pct,
+                       cpu_thresh, mem_thresh, io_thresh);
+               log_top_tasks();
+       }
+
+       queue_delayed_work(system_wq, &psi_work,
+               msecs_to_jiffies(monitor_interval_ms));

If I set monitor_interval_ms to 6 hours, and then change it back to 10s,
it'll only take effect after this callback has fired 6 hours later.

+}
+
+/* Sysfs helpers */
+#define PSI_ATTR_RW(_name)                                             \
+static ssize_t _name##_show(struct kobject *kobj,                      \
+                       struct kobj_attribute *attr, char *buf)         \
+{                                                                      \
+       return sysfs_emit(buf, "%u\n", _name);                          \
+}                                                                      \
+static ssize_t _name##_store(struct kobject *kobj,                     \
+                           struct kobj_attribute *attr,                \
+                           const char *buf, size_t count)              \
+{                                                                      \
+       unsigned int val;                                               \
+       if (kstrtouint(buf, 10, &val))                                  \
+               return -EINVAL;                                         \
+       _name = val;                                                    \
+       return count;                                                   \
+}                                                                      \
+static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
+
+PSI_ATTR_RW(cpu_thresh);
+PSI_ATTR_RW(mem_thresh);
+PSI_ATTR_RW(io_thresh);
+PSI_ATTR_RW(monitor_interval_ms);
+PSI_ATTR_RW(rss_weight);
+PSI_ATTR_RW(io_weight);
+PSI_ATTR_RW(cpu_weight);
+
+static struct attribute *psi_attrs[] = {
+       &cpu_thresh_attr.attr,
+       &mem_thresh_attr.attr,
+       &io_thresh_attr.attr,
+       &monitor_interval_ms_attr.attr,
+       &rss_weight_attr.attr,
+       &io_weight_attr.attr,
+       &cpu_weight_attr.attr,
+       NULL,
+};
+
+static const struct attribute_group psi_attr_group = {
+       .attrs = psi_attrs,
+};
+
+static int __init psi_monitor_init(void)
+{
+       int ret;
+
+       INIT_DELAYED_WORK(&psi_work, psi_monitor_fn);
+       queue_delayed_work(system_wq, &psi_work,
+                       msecs_to_jiffies(monitor_interval_ms));
+
+       psi_kobj = kobject_create_and_add("psi_monitor", kernel_kobj);
+       if (!psi_kobj)
+               return -ENOMEM;
+
+       ret = sysfs_create_group(psi_kobj, &psi_attr_group);
+       if (ret) {
+               kobject_put(psi_kobj);
+               cancel_delayed_work_sync(&psi_work);
+               return ret;
+       }
+
+       pr_info("psi_monitor: in-kernel PSI auto monitor (weighted + tracepoints) loaded\n");
+       return 0;
+}
+
+static void __exit psi_monitor_exit(void)
+{
+       cancel_delayed_work_sync(&psi_work);
+       if (psi_kobj)
+               kobject_put(psi_kobj);
+       pr_info("psi_monitor: unloaded\n");
+}
+
+module_init(psi_monitor_init);
+module_exit(psi_monitor_exit);

There is nothing here that warrants putting this in kernel/sched.
Also this gets included by default when config is enabled and starts
dumping a bunch of stats to dmesg without anyone asking. No?

Afaict, almost all of the detail used here is also available from
procfs and people can easily put together a userspace tool if they
need it. Why do we need an in-kernel module?

+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pintu Kumar Agarwal");
+MODULE_DESCRIPTION("In-kernel PSI automatic monitor with sysfs, weighted scoring and tracepoints");
--
2.34.1

-- 
Thanks and Regards,
Prateek

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help