Thread (8 messages) 8 messages, 2 authors, 2025-05-22
STALE383d

[PATCH net-next v2 1/6] tcp: support writing to a socket in listening state

From: Jeremy Harris <hidden>
Date: 2025-05-21 11:49:44
Also in: netdev
Subsystem: networking [general], networking [sockets], networking [tcp], performance events subsystem, the rest · Maintainers: "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Kuniyuki Iwashima, Willem de Bruijn, Neal Cardwell, Peter Zijlstra, Ingo Molnar, Arnaldo Carvalho de Melo, Namhyung Kim, Linus Torvalds

    In the tcp sendmsg handler, permit a write in LISTENING state if
    a MSG_PRELOAD flag is used.  Copy from iovec to a linear sk_buff
    for placement on the socket write queue.

Signed-off-by: Jeremy Harris <redacted>
---
 include/linux/socket.h                        |   1 +
 net/ipv4/tcp.c                                | 115 ++++++++++++++++++
 .../perf/trace/beauty/include/linux/socket.h  |   1 +
 tools/perf/trace/beauty/msg_flags.c           |   3 +
 4 files changed, 120 insertions(+)
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 3b262487ec06..b41f4cd4dc97 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -330,6 +330,7 @@ struct ucred {
 #define MSG_SOCK_DEVMEM 0x2000000	/* Receive devmem skbs as cmsg */
 #define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
 #define MSG_SPLICE_PAGES 0x8000000	/* Splice the pages from the iterator in sendmsg() */
+#define MSG_PRELOAD	0x10000000	/* Preload tx data while listening */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
 #define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exec for file
 					   descriptor received through
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index b7b6ab41b496..9a5daf27c980 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1057,6 +1057,118 @@ int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied,
 	return err;
 }
 
+/* Cut-down version of tcp_sendmsg_locked(), for writing on a listen socket
+ */
+static int tcp_sendmsg_preload(struct sock *sk, struct msghdr *msg)
+{
+	struct sk_buff *skb;
+	struct sockcm_cookie sockc;
+	int flags, err, copied = 0;
+	int size_goal;
+	int process_backlog = 0;
+	long timeo;
+
+	if (sk->sk_state != TCP_LISTEN)
+		return -EINVAL;
+
+	flags = msg->msg_flags;
+
+	sockc = (struct sockcm_cookie){ .tsflags = READ_ONCE(sk->sk_tsflags) };
+
+	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+
+	/* Ok commence sending. */
+restart:
+	/* Use a arbitrary "mss" value */
+	size_goal = 1000;
+
+	err = -EPIPE;
+	if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
+		goto do_error;
+
+	while (msg_data_left(msg)) {
+		ssize_t copy = 0;
+
+		skb = tcp_write_queue_tail(sk);
+		if (skb)
+			copy = size_goal - skb->len;
+
+		trace_tcp_sendmsg_locked(sk, msg, skb, size_goal);
+
+		if (copy <= 0 || !tcp_skb_can_collapse_to(skb)) {
+			bool first_skb = !skb;
+
+			/* Limit to only one skb on the sk write queue */
+
+			if (!first_skb)
+				goto out_nopush;
+
+			if (!sk_stream_memory_free(sk))
+				goto wait_for_space;
+
+			if (unlikely(process_backlog >= 16)) {
+				process_backlog = 0;
+				if (sk_flush_backlog(sk))
+					goto restart;
+			}
+
+			skb = tcp_stream_alloc_skb(sk, sk->sk_allocation,
+						   first_skb);
+			if (!skb)
+				goto wait_for_space;
+
+			process_backlog++;
+
+#ifdef CONFIG_SKB_DECRYPTED
+			skb->decrypted = !!(flags & MSG_SENDPAGE_DECRYPTED);
+#endif
+			tcp_skb_entail(sk, skb);
+			copy = size_goal;
+		}
+
+		/* Try to append data to the end of skb. */
+		if (copy > msg_data_left(msg))
+			copy = msg_data_left(msg);
+
+		copy = min_t(int, copy, skb_tailroom(skb));
+		err = skb_add_data_nocache(sk, skb, &msg->msg_iter, copy);
+		if (err)
+			goto do_error;
+
+		TCP_SKB_CB(skb)->end_seq += copy;
+		tcp_skb_pcount_set(skb, 0);
+
+		copied += copy;
+		goto out_nopush;
+
+wait_for_space:
+		set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+		tcp_remove_empty_skb(sk);
+
+		err = sk_stream_wait_memory(sk, &timeo);
+		if (err != 0)
+			goto do_error;
+	}
+
+out_nopush:
+	return copied;
+
+do_error:
+	tcp_remove_empty_skb(sk);
+
+	if (copied)
+		goto out_nopush;
+
+	err = sk_stream_error(sk, flags, err);
+	/* make sure we wake any epoll edge trigger waiter */
+	if (unlikely(tcp_rtx_and_write_queues_empty(sk) && err == -EAGAIN)) {
+		sk->sk_write_space(sk);
+		tcp_chrono_stop(sk, TCP_CHRONO_SNDBUF_LIMITED);
+	}
+
+	return err;
+}
+
 int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 {
 	struct net_devmem_dmabuf_binding *binding = NULL;
@@ -1132,6 +1244,9 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
 			goto out_err;
 	}
 
+	if (unlikely(flags & MSG_PRELOAD))
+		return tcp_sendmsg_preload(sk, msg);
+
 	timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
 
 	tcp_rate_check_app_limited(sk);  /* is sending application-limited? */
diff --git a/tools/perf/trace/beauty/include/linux/socket.h b/tools/perf/trace/beauty/include/linux/socket.h
index c3322eb3d686..e9ea498169f3 100644
--- a/tools/perf/trace/beauty/include/linux/socket.h
+++ b/tools/perf/trace/beauty/include/linux/socket.h
@@ -330,6 +330,7 @@ struct ucred {
 #define MSG_SOCK_DEVMEM 0x2000000	/* Receive devmem skbs as cmsg */
 #define MSG_ZEROCOPY	0x4000000	/* Use user data in kernel path */
 #define MSG_SPLICE_PAGES 0x8000000	/* Splice the pages from the iterator in sendmsg() */
+#define MSG_PRELOAD	0x10000000	/* Preload tx data while listening */
 #define MSG_FASTOPEN	0x20000000	/* Send data in TCP SYN */
 #define MSG_CMSG_CLOEXEC 0x40000000	/* Set close_on_exec for file
 					   descriptor received through
diff --git a/tools/perf/trace/beauty/msg_flags.c b/tools/perf/trace/beauty/msg_flags.c
index 2da581ff0c80..27e40da9b02d 100644
--- a/tools/perf/trace/beauty/msg_flags.c
+++ b/tools/perf/trace/beauty/msg_flags.c
@@ -20,6 +20,9 @@
 #ifndef MSG_SPLICE_PAGES
 #define MSG_SPLICE_PAGES	0x8000000
 #endif
+#ifndef MSG_PRELOAD
+#define MSG_PRELOAD		0x10000000
+#endif
 #ifndef MSG_FASTOPEN
 #define MSG_FASTOPEN		0x20000000
 #endif
-- 
2.49.0
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help