Thread (67 messages) 67 messages, 9 authors, 2024-07-10

Re: [PATCH v2 04/12] uprobes: revamp uprobe refcounting and lifetime management

From: Peter Zijlstra <peterz@infradead.org>
Date: 2024-07-02 10:23:05
Also in: bpf

On Mon, Jul 01, 2024 at 03:39:27PM -0700, Andrii Nakryiko wrote:
quoted hunk ↗ jump to hunk
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
index 23449a8c5e7e..560cf1ca512a 100644
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -53,9 +53,10 @@ DEFINE_STATIC_PERCPU_RWSEM(dup_mmap_sem);
 
 struct uprobe {
 	struct rb_node		rb_node;	/* node in the rb tree */
-	refcount_t		ref;
+	atomic64_t		ref;		/* see UPROBE_REFCNT_GET below */
 	struct rw_semaphore	register_rwsem;
 	struct rw_semaphore	consumer_rwsem;
+	struct rcu_head		rcu;
 	struct list_head	pending_list;
 	struct uprobe_consumer	*consumers;
 	struct inode		*inode;		/* Also hold a ref to inode */
@@ -587,15 +588,138 @@ set_orig_insn(struct arch_uprobe *auprobe, struct mm_struct *mm, unsigned long v
 			*(uprobe_opcode_t *)&auprobe->insn);
 }
 
-static struct uprobe *get_uprobe(struct uprobe *uprobe)
+/*
+ * Uprobe's 64-bit refcount is actually two independent counters co-located in
+ * a single u64 value:
+ *   - lower 32 bits are just a normal refcount with is increment and
+ *   decremented on get and put, respectively, just like normal refcount
+ *   would;
+ *   - upper 32 bits are a tag (or epoch, if you will), which is always
+ *   incremented by one, no matter whether get or put operation is done.
+ *
+ * This upper counter is meant to distinguish between:
+ *   - one CPU dropping refcnt from 1 -> 0 and proceeding with "destruction",
+ *   - while another CPU continuing further meanwhile with 0 -> 1 -> 0 refcnt
+ *   sequence, also proceeding to "destruction".
+ *
+ * In both cases refcount drops to zero, but in one case it will have epoch N,
+ * while the second drop to zero will have a different epoch N + 2, allowing
+ * first destructor to bail out because epoch changed between refcount going
+ * to zero and put_uprobe() taking uprobes_treelock (under which overall
+ * 64-bit refcount is double-checked, see put_uprobe() for details).
+ *
+ * Lower 32-bit counter is not meant to over overflow, while it's expected
So refcount_t very explicitly handles both overflow and underflow and
screams bloody murder if they happen. Your thing does not.. 
+ * that upper 32-bit counter will overflow occasionally. Note, though, that we
+ * can't allow upper 32-bit counter to "bleed over" into lower 32-bit counter,
+ * so whenever epoch counter gets highest bit set to 1, __get_uprobe() and
+ * put_uprobe() will attempt to clear upper bit with cmpxchg(). This makes
+ * epoch effectively a 31-bit counter with highest bit used as a flag to
+ * perform a fix-up. This ensures epoch and refcnt parts do not "interfere".
+ *
+ * UPROBE_REFCNT_GET constant is chosen such that it will *increment both*
+ * epoch and refcnt parts atomically with one atomic_add().
+ * UPROBE_REFCNT_PUT is chosen such that it will *decrement* refcnt part and
+ * *increment* epoch part.
+ */
+#define UPROBE_REFCNT_GET ((1LL << 32) + 1LL) /* 0x0000000100000001LL */
+#define UPROBE_REFCNT_PUT ((1LL << 32) - 1LL) /* 0x00000000ffffffffLL */
+
+/*
+ * Caller has to make sure that:
+ *   a) either uprobe's refcnt is positive before this call;
+ *   b) or uprobes_treelock is held (doesn't matter if for read or write),
+ *      preventing uprobe's destructor from removing it from uprobes_tree.
+ *
+ * In the latter case, uprobe's destructor will "resurrect" uprobe instance if
+ * it detects that its refcount went back to being positive again inbetween it
+ * dropping to zero at some point and (potentially delayed) destructor
+ * callback actually running.
+ */
+static struct uprobe *__get_uprobe(struct uprobe *uprobe)
 {
-	refcount_inc(&uprobe->ref);
+	s64 v;
+
+	v = atomic64_add_return(UPROBE_REFCNT_GET, &uprobe->ref);
Distinct lack of u32 overflow testing here..
+
+	/*
+	 * If the highest bit is set, we need to clear it. If cmpxchg() fails,
+	 * we don't retry because there is another CPU that just managed to
+	 * update refcnt and will attempt the same "fix up". Eventually one of
+	 * them will succeed to clear highset bit.
+	 */
+	if (unlikely(v < 0))
+		(void)atomic64_cmpxchg(&uprobe->ref, v, v & ~(1ULL << 63));
+
 	return uprobe;
 }
 static void put_uprobe(struct uprobe *uprobe)
 {
-	if (refcount_dec_and_test(&uprobe->ref)) {
+	s64 v;
+
+	/*
+	 * here uprobe instance is guaranteed to be alive, so we use Tasks
+	 * Trace RCU to guarantee that uprobe won't be freed from under us, if
What's wrong with normal RCU?
+	 * we end up being a losing "destructor" inside uprobe_treelock'ed
+	 * section double-checking uprobe->ref value below.
+	 * Note call_rcu_tasks_trace() + uprobe_free_rcu below.
+	 */
+	rcu_read_lock_trace();
+
+	v = atomic64_add_return(UPROBE_REFCNT_PUT, &uprobe->ref);
No underflow handling... because nobody ever had a double put bug.
quoted hunk ↗ jump to hunk
+	if (unlikely((u32)v == 0)) {
+		bool destroy;
+
+		write_lock(&uprobes_treelock);
+		/*
+		 * We might race with find_uprobe()->__get_uprobe() executed
+		 * from inside read-locked uprobes_treelock, which can bump
+		 * refcount from zero back to one, after we got here. Even
+		 * worse, it's possible for another CPU to do 0 -> 1 -> 0
+		 * transition between this CPU doing atomic_add() and taking
+		 * uprobes_treelock. In either case this CPU should bail out
+		 * and not proceed with destruction.
+		 *
+		 * So now that we have exclusive write lock, we double check
+		 * the total 64-bit refcount value, which includes the epoch.
+		 * If nothing changed (i.e., epoch is the same and refcnt is
+		 * still zero), we are good and we proceed with the clean up.
+		 *
+		 * But if it managed to be updated back at least once, we just
+		 * pretend it never went to zero. If lower 32-bit refcnt part
+		 * drops to zero again, another CPU will proceed with
+		 * destruction, due to more up to date epoch.
+		 */
+		destroy = atomic64_read(&uprobe->ref) == v;
+		if (destroy && uprobe_is_active(uprobe))
+			rb_erase(&uprobe->rb_node, &uprobes_tree);
+		write_unlock(&uprobes_treelock);
+
+		/*
+		 * Beyond here we don't need RCU protection, we are either the
+		 * winning destructor and we control the rest of uprobe's
+		 * lifetime; or we lost and we are bailing without accessing
+		 * uprobe fields anymore.
+		 */
+		rcu_read_unlock_trace();
+
+		/* uprobe got resurrected, pretend we never tried to free it */
+		if (!destroy)
+			return;
+
 		/*
 		 * If application munmap(exec_vma) before uprobe_unregister()
 		 * gets called, we don't get a chance to remove uprobe from
@@ -604,8 +728,21 @@ static void put_uprobe(struct uprobe *uprobe)
 		mutex_lock(&delayed_uprobe_lock);
 		delayed_uprobe_remove(uprobe, NULL);
 		mutex_unlock(&delayed_uprobe_lock);
-		kfree(uprobe);
+
+		call_rcu_tasks_trace(&uprobe->rcu, uprobe_free_rcu);
+		return;
 	}
+
+	/*
+	 * If the highest bit is set, we need to clear it. If cmpxchg() fails,
+	 * we don't retry because there is another CPU that just managed to
+	 * update refcnt and will attempt the same "fix up". Eventually one of
+	 * them will succeed to clear highset bit.
+	 */
+	if (unlikely(v < 0))
+		(void)atomic64_cmpxchg(&uprobe->ref, v, v & ~(1ULL << 63));
+
+	rcu_read_unlock_trace();
 }
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help