[PATCH net-next v8 2/3] sock: add MSG_ZEROCOPY notification mechanism based on msg_control
From: <hidden>
Date: 2024-07-30 18:41:54
Also in:
linux-kselftest, netdev
Subsystem:
alpha port, generic include/asm header files, mips, networking [general], networking [sockets], parisc architecture, sparc + ultrasparc (sparc/sparc64), the rest · Maintainers:
Richard Henderson, Matt Turner, Magnus Lindholm, Arnd Bergmann, Thomas Bogendoerfer, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Kuniyuki Iwashima, Willem de Bruijn, "James E.J. Bottomley", Helge Deller, Andreas Larsson, Linus Torvalds
From: Zijian Zhang <redacted> The MSG_ZEROCOPY flag enables copy avoidance for socket send calls. However, zerocopy is not a free lunch. Apart from the management of user pages, the combination of poll + recvmsg to receive notifications incurs non-negligible overhead in applications. We try to mitigate this overhead with a new notification mechanism based on msg_control. Leveraging the general framework to copy cmsgs to user space, we copy zerocopy notifications to the user when sendmsg returns. Signed-off-by: Zijian Zhang <redacted> Signed-off-by: Xiaochun Lu <redacted> --- arch/alpha/include/uapi/asm/socket.h | 2 + arch/mips/include/uapi/asm/socket.h | 2 + arch/parisc/include/uapi/asm/socket.h | 2 + arch/sparc/include/uapi/asm/socket.h | 2 + include/linux/socket.h | 2 +- include/uapi/asm-generic/socket.h | 2 + include/uapi/linux/socket.h | 23 +++++++++ net/core/sock.c | 72 +++++++++++++++++++++++++-- 8 files changed, 102 insertions(+), 5 deletions(-)
diff --git a/arch/alpha/include/uapi/asm/socket.h b/arch/alpha/include/uapi/asm/socket.h
index e94f621903fe..7c32d9dbe47f 100644
--- a/arch/alpha/include/uapi/asm/socket.h
+++ b/arch/alpha/include/uapi/asm/socket.h@@ -140,6 +140,8 @@ #define SO_PASSPIDFD 76 #define SO_PEERPIDFD 77 +#define SCM_ZC_NOTIFICATION 78 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64
diff --git a/arch/mips/include/uapi/asm/socket.h b/arch/mips/include/uapi/asm/socket.h
index 60ebaed28a4c..3f7fade998cb 100644
--- a/arch/mips/include/uapi/asm/socket.h
+++ b/arch/mips/include/uapi/asm/socket.h@@ -151,6 +151,8 @@ #define SO_PASSPIDFD 76 #define SO_PEERPIDFD 77 +#define SCM_ZC_NOTIFICATION 78 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64
diff --git a/arch/parisc/include/uapi/asm/socket.h b/arch/parisc/include/uapi/asm/socket.h
index be264c2b1a11..77f5bee0fdc9 100644
--- a/arch/parisc/include/uapi/asm/socket.h
+++ b/arch/parisc/include/uapi/asm/socket.h@@ -132,6 +132,8 @@ #define SO_PASSPIDFD 0x404A #define SO_PEERPIDFD 0x404B +#define SCM_ZC_NOTIFICATION 0x404C + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64
diff --git a/arch/sparc/include/uapi/asm/socket.h b/arch/sparc/include/uapi/asm/socket.h
index 682da3714686..eb44fc515b45 100644
--- a/arch/sparc/include/uapi/asm/socket.h
+++ b/arch/sparc/include/uapi/asm/socket.h@@ -133,6 +133,8 @@ #define SO_PASSPIDFD 0x0055 #define SO_PEERPIDFD 0x0056 +#define SCM_ZC_NOTIFICATION 0x0057 + #if !defined(__KERNEL__)
diff --git a/include/linux/socket.h b/include/linux/socket.h
index 40173c919d0f..71e3c6ebfed5 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h@@ -171,7 +171,7 @@ static inline struct cmsghdr * cmsg_nxthdr (struct msghdr *__msg, struct cmsghdr static inline bool cmsg_copy_to_user(struct cmsghdr *__cmsg) { - return 0; + return __cmsg->cmsg_type == SCM_ZC_NOTIFICATION; } static inline size_t msg_data_left(struct msghdr *msg)
diff --git a/include/uapi/asm-generic/socket.h b/include/uapi/asm-generic/socket.h
index 8ce8a39a1e5f..02e9159c7944 100644
--- a/include/uapi/asm-generic/socket.h
+++ b/include/uapi/asm-generic/socket.h@@ -135,6 +135,8 @@ #define SO_PASSPIDFD 76 #define SO_PEERPIDFD 77 +#define SCM_ZC_NOTIFICATION 78 + #if !defined(__KERNEL__) #if __BITS_PER_LONG == 64 || (defined(__x86_64__) && defined(__ILP32__))
diff --git a/include/uapi/linux/socket.h b/include/uapi/linux/socket.h
index d3fcd3b5ec53..b5b5fa9febb1 100644
--- a/include/uapi/linux/socket.h
+++ b/include/uapi/linux/socket.h@@ -2,6 +2,8 @@ #ifndef _UAPI_LINUX_SOCKET_H #define _UAPI_LINUX_SOCKET_H +#include <linux/types.h> + /* * Desired design of maximum size and alignment (see RFC2553) */
@@ -35,4 +37,25 @@ struct __kernel_sockaddr_storage { #define SOCK_TXREHASH_DISABLED 0 #define SOCK_TXREHASH_ENABLED 1 +#define ZC_NOTIFICATION_MAX 16 + +/* + * A zc_info_elem describes one completion notification covering the sendmsg + * calls in the range lo to hi. zerocopy is nonzero when the underlying + * transmission was zerocopy and 0 when the data was copied instead. + */ +struct zc_info_elem { + __u32 lo; + __u32 hi; + __u8 zerocopy; +}; + +/* + * zc_info is the payload of the SCM_ZC_NOTIFICATION control message. + */ +struct zc_info { + __u32 size; /* number of zc_info_elem entries in arr, at most ZC_NOTIFICATION_MAX */ + struct zc_info_elem arr[]; +}; + #endif /* _UAPI_LINUX_SOCKET_H */
diff --git a/net/core/sock.c b/net/core/sock.c
index b2cbe753af1d..37b1b12623ee 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c@@ -1481,10 +1481,12 @@ int sk_setsockopt(struct sock *sk, int level, int optname, ret = -EOPNOTSUPP; } if (!ret) { - if (val < 0 || val > 1) + if (val < 0 || val > 1) { ret = -EINVAL; - else + } else { sock_valbool_flag(sk, SOCK_ZEROCOPY, valbool); + static_branch_enable(&tx_copy_cmsg_to_user_key); + } } break;
@@ -2826,8 +2828,8 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len, } EXPORT_SYMBOL(sock_alloc_send_pskb); -int __sock_cmsg_send(struct sock *sk, struct msghdr *msg __always_unused, - struct cmsghdr *cmsg, struct sockcm_cookie *sockc) +int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, + struct sockcm_cookie *sockc) { u32 tsflags;
@@ -2863,6 +2865,68 @@ int __sock_cmsg_send(struct sock *sk, struct msghdr *msg __always_unused, case SCM_RIGHTS: case SCM_CREDENTIALS: break; + case SCM_ZC_NOTIFICATION: { + struct zc_info *zc = CMSG_DATA(cmsg); + struct sk_buff_head *q, local_q; + int cmsg_data_len, i = 0; + unsigned long flags; + struct sk_buff *skb; + + if (!sock_flag(sk, SOCK_ZEROCOPY) || sk->sk_family == PF_RDS) + return -EINVAL; + + cmsg_data_len = cmsg->cmsg_len - sizeof(struct cmsghdr); + if (cmsg_data_len < sizeof(struct zc_info)) + return -EINVAL; + + if (zc->size > ZC_NOTIFICATION_MAX || + (cmsg_data_len - sizeof(struct zc_info)) != + (zc->size * sizeof(struct zc_info_elem))) + return -EINVAL; + + q = &sk->sk_error_queue; + skb_queue_head_init(&local_q); + + /* Get zerocopy error messages from sk_error_queue, and add them + * to a local queue for later processing. This minimizes the + * code while the spinlock is held and irq is disabled. + */ + spin_lock_irqsave(&q->lock, flags); + skb = skb_peek(q); + while (skb && i < zc->size) { + struct sk_buff *skb_next = skb_peek_next(skb, q); + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + + if (serr->ee.ee_errno != 0 || + serr->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY) { + skb = skb_next; + continue; + } + + __skb_unlink(skb, q); + __skb_queue_tail(&local_q, skb); + skb = skb_next; + i++; + } + spin_unlock_irqrestore(&q->lock, flags); + + i = 0; + while ((skb = skb_peek(&local_q)) != NULL) { + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + + zc->arr[i].hi = serr->ee.ee_data; + zc->arr[i].lo = serr->ee.ee_info; + zc->arr[i].zerocopy = !(serr->ee.ee_code + & SO_EE_CODE_ZEROCOPY_COPIED); + __skb_unlink(skb, &local_q); + consume_skb(skb); + i++; + } + + zc->size = i; + msg->msg_control_copy_to_user = true; + break; + } default: return -EINVAL; }
--
2.20.1