Thread (5 messages) 5 messages, 3 authors, 2009-04-10

Re: [PATCH 2/2] sysctl: lockdep support for sysctl reference counting.

From: Peter Zijlstra <peterz@infradead.org>
Date: 2009-03-31 08:17:47
Also in: lkml

Possibly related (same subject, not in this thread)

On Sat, 2009-03-21 at 00:42 -0700, Eric W. Biederman wrote:
It is possible for get lock ordering deadlocks between locks
and waiting for the sysctl used count to drop to zero.  We have
recently observed one of these in the networking code.

So teach the sysctl code how to speak lockdep so the kernel
can warn about these kinds of rare issues proactively.
It would be very good to extend this changelog with a more detailed
explanation of the deadlock in question.

Let me see if I got it right:

We're holding a lock, while waiting for the refcount to drop to 0.
Dropping that refcount is blocked on that lock.

Something like that?
quoted hunk ↗ jump to hunk
Signed-off-by: Eric Biederman <redacted>
---
 include/linux/sysctl.h |    4 ++
 kernel/sysctl.c        |  108 ++++++++++++++++++++++++++++++++++++++---------
 2 files changed, 91 insertions(+), 21 deletions(-)
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 39d471d..ec9b1dd 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -28,6 +28,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/compiler.h>
+#include <linux/lockdep.h>
 
 struct file;
 struct completion;
@@ -1087,6 +1088,9 @@ struct ctl_table_header
 	struct ctl_table *attached_by;
 	struct ctl_table *attached_to;
 	struct ctl_table_header *parent;
+#ifdef CONFIG_PROVE_LOCKING
+	struct lockdep_map dep_map;
+#endif
 };
 
 /* struct ctl_path describes where in the hierarchy a table is added */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44f..ea8cc39 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1454,12 +1454,63 @@ static struct ctl_table dev_table[] = {
 
 static DEFINE_SPINLOCK(sysctl_lock);
 
+#ifndef CONFIG_PROVE_LOCKING
+
+# define lock_sysctl() spin_lock(&sysctl_lock)
+# define unlock_sysctl() spin_unlock(&sysctl_lock)
+
+static inline void table_acquire_use(struct ctl_table_header *hdr) { }
+static inline void table_release_use(struct ctl_table_header *hdr) { }
+static inline void table_acquire(struct ctl_table_header *hdr) { }
+static inline void table_contended(struct ctl_table_header *hdr) { }
+static inline void table_acquired(struct ctl_table_header *hdr) { }
+static inline void table_release(struct ctl_table_header *hdr) { }
+
+#else	/* CONFIG_PROVE_LOCKING */
+
+#  define lock_sysctl() __raw_spin_lock(&sysctl_lock.raw_lock)
+#  define unlock_sysctl() __raw_spin_unlock(&sysctl_lock.raw_lock)
Uhmm, Please explain that -- without a proper explanation this is a NAK.
quoted hunk ↗ jump to hunk
+static inline void table_acquire_use(struct ctl_table_header *hdr)
+{
+	lock_acquire(&hdr->dep_map, 0, 0, 1, 2, NULL, _RET_IP_);
+	lock_acquired(&hdr->dep_map, _RET_IP_);
+}
+
+static inline void table_release_use(struct ctl_table_header *hdr)
+{
+	lock_release(&hdr->dep_map, 0, _RET_IP_);
+}
+
+static inline void table_acquire(struct ctl_table_header *hdr)
+{
+	lock_acquire(&hdr->dep_map, 0, 0, 0, 2, NULL, _RET_IP_);
+}
+
+static inline void table_contended(struct ctl_table_header *hdr)
+{
+	lock_contended(&hdr->dep_map, _RET_IP_);
+}
+
+static inline void table_acquired(struct ctl_table_header *hdr)
+{
+	lock_acquired(&hdr->dep_map, _RET_IP_);
+}
+
+static inline void table_release(struct ctl_table_header *hdr)
+{
+	lock_release(&hdr->dep_map, 0, _RET_IP_);
+}
+
+#endif /* CONFIG_PROVE_LOCKING */
+
 /* called under sysctl_lock */
 static int use_table(struct ctl_table_header *p)
 {
 	if (unlikely(p->unregistering))
 		return 0;
 	p->used++;
+	table_acquire_use(p);
 	return 1;
 }
 
@@ -1469,6 +1520,8 @@ static void unuse_table(struct ctl_table_header *p)
 	if (!--p->used)
 		if (unlikely(p->unregistering))
 			complete(p->unregistering);
+
+	table_release_use(p);
 }
 
 /* called under sysctl_lock, will reacquire if has to wait */
@@ -1478,47 +1531,54 @@ static void start_unregistering(struct ctl_table_header *p)
 	 * if p->used is 0, nobody will ever touch that entry again;
 	 * we'll eliminate all paths to it before dropping sysctl_lock
 	 */
+	table_acquire(p);
 	if (unlikely(p->used)) {
 		struct completion wait;
+		table_contended(p);
+
 		init_completion(&wait);
 		p->unregistering = &wait;
-		spin_unlock(&sysctl_lock);
+		unlock_sysctl();
 		wait_for_completion(&wait);
-		spin_lock(&sysctl_lock);
+		lock_sysctl();
 	} else {
 		/* anything non-NULL; we'll never dereference it */
 		p->unregistering = ERR_PTR(-EINVAL);
 	}
+	table_acquired(p);
+
 	/*
 	 * do not remove from the list until nobody holds it; walking the
 	 * list in do_sysctl() relies on that.
 	 */
 	list_del_init(&p->ctl_entry);
+
+	table_release(p);
 }
 
quoted hunk ↗ jump to hunk
@@ -1951,7 +2011,13 @@ struct ctl_table_header *__register_sysctl_paths(
 		return NULL;
 	}
 #endif
-	spin_lock(&sysctl_lock);
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+	{
+		static struct lock_class_key __key;
+		lockdep_init_map(&header->dep_map, "sysctl_used", &__key, 0);
+	}
+#endif	
This means every sysctl thingy gets the same class, is that
intended/desired?
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help