Thread (25 messages), 3 authors, 16h ago

[PATCH v3 bpf-next 06/11] bpf: tcp: Make BPF_SOCK_OPS_RCVQ_CB and SOCKMAP mutually exclusive.

From: Kuniyuki Iwashima <kuniyu@google.com>
Date: 2026-05-23 08:30:09
Also in: netdev
Subsystem: bpf [general] (safe dynamic programs and tools), bpf [l7 framework] (sockmap), bpf [networking] (tcx & tc bpf, sock_addr), networking [general], networking [tcp], the rest · Maintainers: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi, John Fastabend, Jakub Sitnicki, Martin KaFai Lau, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Neal Cardwell, Linus Torvalds

Both BPF_SOCK_OPS_RCVQ_CB and SOCKMAP can intercept and handle
socket receive queues, leading to overlapping use cases.

While BPF_SOCK_OPS_RCVQ_CB focuses on optimizing single-socket
performance by reducing EPOLLIN wakeups and fully preserves TCP
zerocopy support, SOCKMAP is designed to facilitate multi-socket
routing at the cost of higher overhead and no zerocopy support.

Enabling both features on the same socket makes no sense and
results in unexpected interference between them.

For instance, SOCKMAP calls __tcp_cleanup_rbuf(), where we will
add a BPF_SOCK_OPS_RCVQ_CB hook, and bpf_sock_ops_tcp_set_rcvlowat()
calls sk->sk_data_ready(), which would trigger SOCKMAP.

Let's make BPF_SOCK_OPS_RCVQ_CB and SOCKMAP mutually exclusive.

Note that this requires taking write_lock_bh(&sk->sk_callback_lock)
to synchronise with tcp_bpf_update_proto() and checking whether
sk->sk_prot is one of tcp_bpf_prots[][], because
sock_map_update_elem() only holds bh_lock_sock() without checking
sock_owned_by_user(), so owning the socket lock alone does not
exclude a concurrent SOCKMAP update.

Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
v3: Check sk->sk_prot and update tp->bpf_sock_ops_cb_flags
    under sk->sk_callback_lock, and only when not flagged yet.
---
 include/net/tcp.h  |  1 +
 net/core/filter.c  | 35 +++++++++++++++++++++++++++++++----
 net/ipv4/tcp_bpf.c | 12 ++++++++++++
 3 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/include/net/tcp.h b/include/net/tcp.h
index c6a6853909c4..bc95d8e7b62e 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -2853,6 +2853,7 @@ struct sk_msg;
 struct sk_psock;
 
 #ifdef CONFIG_BPF_SYSCALL
+bool tcp_in_sockmap(const struct sock *sk);
 int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore);
 void tcp_bpf_clone(const struct sock *sk, struct sock *newsk);
 #ifdef CONFIG_BPF_STREAM_PARSER
diff --git a/net/core/filter.c b/net/core/filter.c
index 3608036632a8..1fb63b264b18 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -5382,12 +5382,34 @@ static int bpf_sol_tcp_getsockopt(struct sock *sk, int optname,
 	return 0;
 }
 
+static int __bpf_sock_ops_cb_flags_set(struct sock *sk, int val)
+{
+	if (!(val & BPF_SOCK_OPS_RCVQ_CB_FLAG) ||
+	    tcp_sk(sk)->bpf_sock_ops_cb_flags & BPF_SOCK_OPS_RCVQ_CB_FLAG) {
+		tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+		return 0;
+	}
+
+	write_lock_bh(&sk->sk_callback_lock);
+
+	if (unlikely(tcp_in_sockmap(sk))) {
+		write_unlock_bh(&sk->sk_callback_lock);
+		return -EBUSY;
+	}
+
+	tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+
+	write_unlock_bh(&sk->sk_callback_lock);
+
+	return 0;
+}
+
 static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
 				  char *optval, int optlen)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	unsigned long timeout;
-	int val;
+	int val, err;
 
 	if (optlen != sizeof(int))
 		return -EINVAL;
@@ -5424,7 +5446,9 @@ static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname,
 	case TCP_BPF_SOCK_OPS_CB_FLAGS:
 		if (val & ~(BPF_SOCK_OPS_ALL_CB_FLAGS))
 			return -EINVAL;
-		tp->bpf_sock_ops_cb_flags = val;
+		err = __bpf_sock_ops_cb_flags_set(sk, val);
+		if (err)
+			return err;
 		break;
 	default:
 		return -EINVAL;
@@ -5999,8 +6023,9 @@ static const struct bpf_func_proto bpf_sock_ops_getsockopt_proto = {
 BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
 	   int, argval)
 {
-	struct sock *sk = bpf_sock->sk;
 	int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS;
+	struct sock *sk = bpf_sock->sk;
+	int err;
 
 	if (!is_locked_tcp_sock_ops(bpf_sock) &&
 	    bpf_sock->op != BPF_SOCK_OPS_RCVQ_CB)
@@ -6009,7 +6034,9 @@ BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock,
 	if (!IS_ENABLED(CONFIG_INET) || !sk_fullsock(sk))
 		return -EINVAL;
 
-	tcp_sk(sk)->bpf_sock_ops_cb_flags = val;
+	err = __bpf_sock_ops_cb_flags_set(sk, val);
+	if (err)
+		return err;
 
 	return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS);
 }
diff --git a/net/ipv4/tcp_bpf.c b/net/ipv4/tcp_bpf.c
index cc0bd73f36b6..7e7966b095f9 100644
--- a/net/ipv4/tcp_bpf.c
+++ b/net/ipv4/tcp_bpf.c
@@ -705,6 +705,16 @@ int tcp_bpf_strp_read_sock(struct strparser *strp, read_descriptor_t *desc,
 }
 #endif /* CONFIG_BPF_STREAM_PARSER */
 
+bool tcp_in_sockmap(const struct sock *sk)
+{
+	const struct proto *prot = sk->sk_prot;
+
+	lockdep_assert_held(&sk->sk_callback_lock);
+
+	return &tcp_bpf_prots[0][0] <= prot &&
+		prot <= &tcp_bpf_prots[TCP_BPF_NUM_PROTS - 1][TCP_BPF_NUM_CFGS - 1];
+}
+
 int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 {
 	int family = sk->sk_family == AF_INET6 ? TCP_BPF_IPV6 : TCP_BPF_IPV4;
@@ -729,6 +739,8 @@ int tcp_bpf_update_proto(struct sock *sk, struct sk_psock *psock, bool restore)
 			sock_replace_proto(sk, psock->sk_proto);
 		}
 		return 0;
+	} else if (BPF_SOCK_OPS_TEST_FLAG(tcp_sk(sk), BPF_SOCK_OPS_RCVQ_CB_FLAG)) {
+		return -EBUSY;
 	}
 
 	if (sk->sk_family == AF_INET6) {
-- 
2.54.0.746.g67dd491aae-goog
Keyboard shortcuts
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help