Thread (22 messages) 22 messages, 7 authors, 2022-08-05
STALE1386d REVIEWED: 1 (1M)

[PATCH v2] net/smc: fix refcount bug in sk_psock_get (2)

From: Hawkins Jiawei <hidden>
Date: 2022-07-30 08:59:06
Also in: bpf, linux-kernel-mentees, linux-s390, lkml
Subsystem: bpf [l7 framework] (sockmap), networking [general], networking [sockets], shared memory communications (smc) sockets, the rest · Maintainers: John Fastabend, Jakub Sitnicki, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Kuniyuki Iwashima, Willem de Bruijn, D. Wythe, Dust Li, Sidraya Jayagond, Wenjia Zhang, Linus Torvalds

Syzkaller reports refcount bug as follows:
------------[ cut here ]------------
refcount_t: saturated; leaking memory.
WARNING: CPU: 1 PID: 3605 at lib/refcount.c:19 refcount_warn_saturate+0xf4/0x1e0 lib/refcount.c:19
Modules linked in:
CPU: 1 PID: 3605 Comm: syz-executor208 Not tainted 5.18.0-syzkaller-03023-g7e062cda7d90 #0
...
Call Trace:
 <TASK>
 __refcount_add_not_zero include/linux/refcount.h:163 [inline]
 __refcount_inc_not_zero include/linux/refcount.h:227 [inline]
 refcount_inc_not_zero include/linux/refcount.h:245 [inline]
 sk_psock_get+0x3bc/0x410 include/linux/skmsg.h:439
 tls_data_ready+0x6d/0x1b0 net/tls/tls_sw.c:2091
 tcp_data_ready+0x106/0x520 net/ipv4/tcp_input.c:4983
 tcp_data_queue+0x25f2/0x4c90 net/ipv4/tcp_input.c:5057
 tcp_rcv_state_process+0x1774/0x4e80 net/ipv4/tcp_input.c:6659
 tcp_v4_do_rcv+0x339/0x980 net/ipv4/tcp_ipv4.c:1682
 sk_backlog_rcv include/net/sock.h:1061 [inline]
 __release_sock+0x134/0x3b0 net/core/sock.c:2849
 release_sock+0x54/0x1b0 net/core/sock.c:3404
 inet_shutdown+0x1e0/0x430 net/ipv4/af_inet.c:909
 __sys_shutdown_sock net/socket.c:2331 [inline]
 __sys_shutdown_sock net/socket.c:2325 [inline]
 __sys_shutdown+0xf1/0x1b0 net/socket.c:2343
 __do_sys_shutdown net/socket.c:2351 [inline]
 __se_sys_shutdown net/socket.c:2349 [inline]
 __x64_sys_shutdown+0x50/0x70 net/socket.c:2349
 do_syscall_x64 arch/x86/entry/common.c:50 [inline]
 do_syscall_64+0x35/0xb0 arch/x86/entry/common.c:80
 entry_SYSCALL_64_after_hwframe+0x46/0xb0
 </TASK>

During the SMC fallback process in the connect syscall, the kernel
will replace TCP with SMC. In order to forward wakeups to the
smc socket waitqueue after fallback, the kernel sets
clcsk->sk_user_data to the original smc socket in
smc_fback_replace_callbacks().

Later, in the shutdown syscall, the kernel calls
sk_psock_get(), which treats clcsk->sk_user_data
as a sk_psock, triggering the refcount warning.

So the root cause is that both smc and psock use the
sk_user_data field, so each can easily misinterpret a
pointer stored there by the other.

This patch solves it by using another bit (defined as
SK_USER_DATA_NOTPSOCK) in PTRMASK to mark whether
sk_user_data points to a sk_psock object or not.
This patch depends on a PTRMASK introduced in commit f1ff5ce2cd5e
("net, sk_msg: Clear sk_user_data pointer on clone if tagged").

Fixes: 341adeec9ada ("net/smc: Forward wakeup to smc socket waitqueue after fallback")
Fixes: a60a2b1e0af1 ("net/smc: reduce active tcp_listen workers")
Reported-and-tested-by: syzbot+5f26f85569bd179c18ce@syzkaller.appspotmail.com
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Acked-by: Wen Gu <guwen@linux.alibaba.com>
Signed-off-by: Hawkins Jiawei <redacted>
---
v1 -> v2: 
  - add bit in PTRMASK to patch the bug

 include/linux/skmsg.h |  2 +-
 include/net/sock.h    | 27 +++++++++++++++++++++++++--
 net/smc/af_smc.c      |  6 ++++--
 net/smc/smc.h         |  2 +-
 4 files changed, 31 insertions(+), 6 deletions(-)
diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
index c5a2d6f50f25..81bfa1a33623 100644
--- a/include/linux/skmsg.h
+++ b/include/linux/skmsg.h
@@ -277,7 +277,7 @@ static inline void sk_msg_sg_copy_clear(struct sk_msg *msg, u32 start)
 
 static inline struct sk_psock *sk_psock(const struct sock *sk)
 {
-	return rcu_dereference_sk_user_data(sk);
+	return rcu_dereference_sk_user_data_psock(sk);
 }
 
 static inline void sk_psock_set_state(struct sk_psock *psock,
diff --git a/include/net/sock.h b/include/net/sock.h
index 9fa54762e077..316c0313b2bf 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -549,10 +549,17 @@ enum sk_pacing {
  * when cloning the socket. For instance, it can point to a reference
  * counted object. sk_user_data bottom bit is set if pointer must not
  * be copied.
+ *
+ * SK_USER_DATA_NOCOPY   - test if pointer must not be copied
+ * SK_USER_DATA_BPF      - managed by BPF
+ * SK_USER_DATA_NOTPSOCK - test if pointer does not point to psock
  */
 #define SK_USER_DATA_NOCOPY	1UL
-#define SK_USER_DATA_BPF	2UL	/* Managed by BPF */
-#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF)
+#define SK_USER_DATA_BPF	2UL
+#define SK_USER_DATA_NOTPSOCK	4UL
+#define SK_USER_DATA_PTRMASK	~(SK_USER_DATA_NOCOPY | SK_USER_DATA_BPF |\
+				  SK_USER_DATA_NOTPSOCK)
+
 
 /**
  * sk_user_data_is_nocopy - Test if sk_user_data pointer must not be copied
@@ -584,6 +591,22 @@ static inline bool sk_user_data_is_nocopy(const struct sock *sk)
 			   __tmp | SK_USER_DATA_NOCOPY);		\
 })
 
+/**
+ * rcu_dereference_sk_user_data_psock - return psock if sk_user_data points
+ * to the psock
+ * @sk: socket
+ */
+static inline
+struct sk_psock *rcu_dereference_sk_user_data_psock(const struct sock *sk)
+{
+	uintptr_t __tmp = (uintptr_t)rcu_dereference(__sk_user_data((sk)));
+
+	if (__tmp & SK_USER_DATA_NOTPSOCK)
+		return NULL;
+	return (struct sk_psock *)(__tmp & SK_USER_DATA_PTRMASK);
+}
+
+
 static inline
 struct net *sock_net(const struct sock *sk)
 {
diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
index 433bb5a7df31..d0feccf824c8 100644
--- a/net/smc/af_smc.c
+++ b/net/smc/af_smc.c
@@ -812,7 +812,8 @@ static void smc_fback_replace_callbacks(struct smc_sock *smc)
 	struct sock *clcsk = smc->clcsock->sk;
 
 	write_lock_bh(&clcsk->sk_callback_lock);
-	clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
+	clcsk->sk_user_data = (void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY |
+				       SK_USER_DATA_NOTPSOCK);
 
 	smc_clcsock_replace_cb(&clcsk->sk_state_change, smc_fback_state_change,
 			       &smc->clcsk_state_change);
@@ -2470,7 +2471,8 @@ static int smc_listen(struct socket *sock, int backlog)
 	 */
 	write_lock_bh(&smc->clcsock->sk->sk_callback_lock);
 	smc->clcsock->sk->sk_user_data =
-		(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY);
+		(void *)((uintptr_t)smc | SK_USER_DATA_NOCOPY |
+			 SK_USER_DATA_NOTPSOCK);
 	smc_clcsock_replace_cb(&smc->clcsock->sk->sk_data_ready,
 			       smc_clcsock_data_ready, &smc->clcsk_data_ready);
 	write_unlock_bh(&smc->clcsock->sk->sk_callback_lock);
diff --git a/net/smc/smc.h b/net/smc/smc.h
index 5ed765ea0c73..c24d0469d267 100644
--- a/net/smc/smc.h
+++ b/net/smc/smc.h
@@ -299,7 +299,7 @@ static inline void smc_init_saved_callbacks(struct smc_sock *smc)
 static inline struct smc_sock *smc_clcsock_user_data(const struct sock *clcsk)
 {
 	return (struct smc_sock *)
-	       ((uintptr_t)clcsk->sk_user_data & ~SK_USER_DATA_NOCOPY);
+	       ((uintptr_t)clcsk->sk_user_data & SK_USER_DATA_PTRMASK);
 }
 
 /* save target_cb in saved_cb, and replace target_cb with new_cb */
-- 
2.25.1
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help