Thread (17 messages) 17 messages, 7 authors, 2023-01-19

Re: sched/debug: CPU hotplug operation suffers in a large cpu systems

From: Phil Auld <hidden>
Date: 2022-12-14 02:27:10
Also in: lkml

On Wed, Dec 14, 2022 at 10:41:25AM +1100 Michael Ellerman wrote:
Phil Auld [off-list ref] writes:
quoted
On Tue, Dec 13, 2022 at 07:23:54AM +0100 Greg Kroah-Hartman wrote:
quoted
On Mon, Dec 12, 2022 at 02:17:58PM -0500, Phil Auld wrote:
quoted
Hi,

On Tue, Nov 08, 2022 at 01:24:39PM +0100 Greg Kroah-Hartman wrote:
quoted
On Tue, Nov 08, 2022 at 03:30:46PM +0530, Vishal Chourasia wrote:
quoted
Thanks Greg & Peter for your direction. 

While we pursue the idea of having debugfs based on kernfs, we thought about
having a boot time parameter which would disable creating and updating of the
sched_domain debugfs files and this would also be useful even when the kernfs
solution kicks in, as users who may not care about these debugfs files would
benefit from a faster CPU hotplug operation.
Ick, no, you would be adding a new user/kernel api that you will be
required to support for the next 20+ years.  Just to get over a
short-term issue before you solve the problem properly.
I'm not convinced moving these files from debugfs to kernfs is the right
fix.  That will take it from ~50 back to ~20 _minutes_ on these systems.
I don't think either of those numbers is reasonable.

The issue as I see it is the full rebuild for every change with no way to
batch the changes. How about something like the below?

This puts the domains/* files under the sched_verbose flag. About the only
things under that flag now are the detailed topology discovery printks anyway,
so this fits together nicely.

This way the files would be off by default (assuming you don't boot with
sched_verbose) and can be created at runtime by enabling verbose. Multiple
changes could also be batched by disabling/making changes/re-enabling.

It does not create a new API; it uses one that is already there.
The idea seems good, the implementation might need a bit of work :)
More than the one comment below? Let me know.
quoted
quoted
quoted
If you really do not want these debugfs files, just disable debugfs from
your system.  That should be a better short-term solution, right?
We do find these files useful at times for debugging issues and looking
at what's going on on the system.
quoted
Or better yet, disable SCHED_DEBUG, why can't you do that?
Same with this... useful information with (modulo issues like this)
small cost. There are also tuning knobs that are only available
with SCHED_DEBUG. 


Cheers,
Phil

---------------

sched/debug: Put sched/domains files under verbose flag

The debug files under sched/domains can take a long time to regenerate,
especially when updates are done one at a time. Move these files under
the verbose debug flag. Allow changes to verbose to trigger generation
of the files. This lets a user batch the updates but still have the
information available.  The detailed topology printk messages are also
under verbose.

Signed-off-by: Phil Auld <redacted>
---
 kernel/sched/debug.c | 68 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 2 deletions(-)
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1637b65ba07a..2eb51ee3ccab 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -280,6 +280,31 @@ static const struct file_operations sched_dynamic_fops = {
 
 __read_mostly bool sched_debug_verbose;
 
+static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
+				   size_t cnt, loff_t *ppos);
+
+static int sched_verbose_show(struct seq_file *m, void *v)
+{
+	if (sched_debug_verbose)
+		seq_puts(m,"Y\n");
+	else
+		seq_puts(m,"N\n");
+	return 0;
+}
+
+static int sched_verbose_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, sched_verbose_show, NULL);
+}
+
+static const struct file_operations sched_verbose_fops = {
+	.open		= sched_verbose_open,
+	.write		= sched_verbose_write,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 static const struct seq_operations sched_debug_sops;
 
 static int sched_debug_open(struct inode *inode, struct file *filp)
@@ -303,7 +328,7 @@ static __init int sched_init_debug(void)
 	debugfs_sched = debugfs_create_dir("sched", NULL);
 
 	debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
-	debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
+	debugfs_create_file("verbose", 0644, debugfs_sched, NULL, &sched_verbose_fops);
 #ifdef CONFIG_PREEMPT_DYNAMIC
 	debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
 #endif
@@ -402,15 +427,23 @@ void update_sched_domain_debugfs(void)
 	if (!debugfs_sched)
 		return;
 
+	if (!sched_debug_verbose)
+		return;
+
 	if (!cpumask_available(sd_sysctl_cpus)) {
 		if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
 			return;
 		cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
 	}
 
-	if (!sd_dentry)
+	if (!sd_dentry) {
 		sd_dentry = debugfs_create_dir("domains", debugfs_sched);
 
+		/* rebuild sd_sysclt_cpus if empty since it gets cleared below */
+		if (cpumask_first(sd_sysctl_cpus) >=  nr_cpu_ids)
+			cpumask_copy(sd_sysctl_cpus, cpu_online_mask);
+	}
+
 	for_each_cpu(cpu, sd_sysctl_cpus) {
 		struct sched_domain *sd;
 		struct dentry *d_cpu;
@@ -443,6 +476,37 @@ void dirty_sched_domain_sysctl(int cpu)
 
 #endif /* CONFIG_SMP */
 
+static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
+				   size_t cnt, loff_t *ppos)
+{
+	struct dentry *dentry = filp->f_path.dentry;
+	bool orig = sched_debug_verbose;
+	bool bv;
+	int r;
+
+	r = kstrtobool_from_user(ubuf, cnt, &bv);
+	if (!r) {
+		mutex_lock(&sched_domains_mutex);
+		r = debugfs_file_get(dentry);
+		if (unlikely(r))
+			return r;
+		sched_debug_verbose = bv;
+		debugfs_file_put(dentry);
Why the get/put of the debugfs dentry? for just this single value?
That's what debugfs_file_write_bool() does, which is where I got that since
that's really what this is doing. I couldn't see a good way to make this
just call that.
I think you can do it like below? Only lightly tested :)
That simplifies things.  Thanks!

I'm testing a new version now but will switch to this method and see what
happens.


Cheers,
Phil

quoted hunk ↗ jump to hunk
cheers

diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 1637b65ba07a..bc96380cf336 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -280,6 +279,42 @@ static const struct file_operations sched_dynamic_fops = {
 
 __read_mostly bool sched_debug_verbose;
 
+#ifdef CONFIG_SMP
+static struct dentry		*sd_dentry;
+
+static ssize_t sched_verbose_write(struct file *filp, const char __user *ubuf,
+				   size_t cnt, loff_t *ppos)
+{
+	ssize_t result;
+	bool orig;
+
+	mutex_lock(&sched_domains_mutex);
+
+	orig = sched_debug_verbose;
+	result = debugfs_write_file_bool(filp, ubuf, cnt, ppos);
+
+	if (sched_debug_verbose && !orig)
+		update_sched_domain_debugfs();
+	else if (!sched_debug_verbose && orig) {
+		debugfs_remove(sd_dentry);
+		sd_dentry = NULL;
+	}
+
+	mutex_unlock(&sched_domains_mutex);
+
+	return result;
+}
+#else
+#define sched_verbose_write debugfs_write_file_bool
+#endif
+
+static const struct file_operations sched_verbose_fops = {
+	.read =		debugfs_read_file_bool,
+	.write =	sched_verbose_write,
+	.open =		simple_open,
+	.llseek =	default_llseek,
+};
+
 static const struct seq_operations sched_debug_sops;
 
 static int sched_debug_open(struct inode *inode, struct file *filp)
@@ -303,7 +338,7 @@ static __init int sched_init_debug(void)
 	debugfs_sched = debugfs_create_dir("sched", NULL);
 
 	debugfs_create_file("features", 0644, debugfs_sched, NULL, &sched_feat_fops);
-	debugfs_create_bool("verbose", 0644, debugfs_sched, &sched_debug_verbose);
+	debugfs_create_file_unsafe("verbose", 0644, debugfs_sched, &sched_debug_verbose, &sched_verbose_fops);
 #ifdef CONFIG_PREEMPT_DYNAMIC
 	debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops);
 #endif
@@ -345,7 +380,6 @@ late_initcall(sched_init_debug);
 #ifdef CONFIG_SMP
 
 static cpumask_var_t		sd_sysctl_cpus;
-static struct dentry		*sd_dentry;
 
 static int sd_flags_show(struct seq_file *m, void *v)
 {
@@ -402,15 +436,23 @@ void update_sched_domain_debugfs(void)
 	if (!debugfs_sched)
 		return;
 
+	if (!sched_debug_verbose)
+		return;
+
 	if (!cpumask_available(sd_sysctl_cpus)) {
 		if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
 			return;
 		cpumask_copy(sd_sysctl_cpus, cpu_possible_mask);
 	}
 
-	if (!sd_dentry)
+	if (!sd_dentry) {
 		sd_dentry = debugfs_create_dir("domains", debugfs_sched);
 
+		/* rebuild sd_sysclt_cpus if empty since it gets cleared below */
+		if (cpumask_first(sd_sysctl_cpus) >=  nr_cpu_ids)
+			cpumask_copy(sd_sysctl_cpus, cpu_online_mask);
+	}
+
 	for_each_cpu(cpu, sd_sysctl_cpus) {
 		struct sched_domain *sd;
 		struct dentry *d_cpu;
-- 
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help