Thread: 12 messages, 3 authors, 2018-11-30

Re: [PATCH net-next v2 1/2] udp: msg_zerocopy

From: Paolo Abeni <pabeni@redhat.com>
Date: 2018-11-29 19:32:12
Subsystem: networking [general], networking [ipv4/ipv6], the rest · Maintainers: "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, David Ahern, Ido Schimmel, Linus Torvalds

Hi,

Thank you for the update!

On Wed, 2018-11-28 at 18:50 -0500, Willem de Bruijn wrote:
> I did revert to the basic implementation using an extra ref
> for the function call, similar to TCP, as you suggested.
>
> On top of that as a separate optimization patch I have a
> variant that uses refcnt zero by replacing refcount_inc with
> refcount_set(.., refcount_read(..) + 1). Not very pretty.

If the skb/uarg is not shared (no other threads can touch the refcnt)
before ip*_append_data() completes, how about something like the
following (incremental diff on top of patch 1/2, untested, uncompiled,
just to give the idea):

---
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 04f52e719571..1e3d195ffdfb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -480,6 +480,13 @@ static inline void sock_zerocopy_get(struct ubuf_info *uarg)
 	refcount_inc(&uarg->refcnt);
 }
 
+/* use only before uarg is actually shared */
+static inline void __sock_zerocopy_init(struct ubuf_info *uarg, int cnt)
+{
+	if (uarg)
+		refcount_set(&uarg->refcnt, cnt);
+}
+
 void sock_zerocopy_put(struct ubuf_info *uarg);
 void sock_zerocopy_put_abort(struct ubuf_info *uarg);
 
@@ -1326,13 +1333,20 @@ static inline struct ubuf_info *skb_zcopy(struct sk_buff *skb)
 	return is_zcopy ? skb_uarg(skb) : NULL;
 }
 
-static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+static inline int __skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
 {
 	if (skb && uarg && !skb_zcopy(skb)) {
-		sock_zerocopy_get(uarg);
 		skb_shinfo(skb)->destructor_arg = uarg;
 		skb_shinfo(skb)->tx_flags |= SKBTX_ZEROCOPY_FRAG;
+		return 1;
 	}
+	return 0;
+}
+
+static inline void skb_zcopy_set(struct sk_buff *skb, struct ubuf_info *uarg)
+{
+	if (__skb_zcopy_set(skb, uarg))
+		sock_zerocopy_get(uarg);
 }
 
 static inline void skb_zcopy_set_nouarg(struct sk_buff *skb, void *val)
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 2179ef84bb44..435bac91d293 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -957,7 +957,7 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
 	uarg->len = 1;
 	uarg->bytelen = size;
 	uarg->zerocopy = 1;
-	refcount_set(&uarg->refcnt, sk->sk_type == SOCK_STREAM ? 1 : 0);
+	refcount_set(&uarg->refcnt, 1);
 	sock_hold(sk);
 
 	return uarg;
@@ -1097,13 +1097,6 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
 		atomic_dec(&sk->sk_zckey);
 		uarg->len--;
 
-		/* Stream socks hold a ref for the syscall, as skbs can be sent
-		 * and freed inside the loop, dropping refcnt to 0 inbetween.
-		 * Datagrams do not need this, but sock_zerocopy_put expects it.
-		 */
-		if (sk->sk_type != SOCK_STREAM && !refcount_read(&uarg->refcnt))
-			refcount_set(&uarg->refcnt, 1);
-
 		sock_zerocopy_put(uarg);
 	}
 }
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 7504da2f33d6..d3285613d87a 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -882,6 +882,7 @@ static int __ip_append_data(struct sock *sk,
 	struct rtable *rt = (struct rtable *)cork->dst;
 	unsigned int wmem_alloc_delta = 0;
 	u32 tskey = 0;
+	int uarg_refs = 0;
 	bool paged;
 
 	skb = skb_peek_tail(queue);
@@ -919,6 +920,7 @@ static int __ip_append_data(struct sock *sk,
 
 	if (flags & MSG_ZEROCOPY && length) {
 		uarg = sock_zerocopy_realloc(sk, length, skb_zcopy(skb));
+		uarg_refs = 1;
 		if (!uarg)
 			return -ENOBUFS;
 		if (rt->dst.dev->features & NETIF_F_SG &&
@@ -926,7 +928,7 @@ static int __ip_append_data(struct sock *sk,
 			paged = true;
 		} else {
 			uarg->zerocopy = 0;
-			skb_zcopy_set(skb, uarg);
+			uarg_refs += __skb_zcopy_set(skb, uarg);
 		}
 	}
 
@@ -1019,7 +1021,7 @@ static int __ip_append_data(struct sock *sk,
 			cork->tx_flags = 0;
 			skb_shinfo(skb)->tskey = tskey;
 			tskey = 0;
-			skb_zcopy_set(skb, uarg);
+			uarg_refs += __skb_zcopy_set(skb, uarg);
 
 			/*
 			 *	Find where to start putting bytes.
@@ -1121,6 +1123,7 @@ static int __ip_append_data(struct sock *sk,
 		length -= copy;
 	}
 
+	__sock_zerocopy_init(uarg, uarg_refs);
 	if (wmem_alloc_delta)
 		refcount_add(wmem_alloc_delta, &sk->sk_wmem_alloc);
 	return 0;
@@ -1128,6 +1131,7 @@ static int __ip_append_data(struct sock *sk,
 error_efault:
 	err = -EFAULT;
 error:
+	__sock_zerocopy_init(uarg, uarg_refs);
 	sock_zerocopy_put_abort(uarg);
 	cork->length -= length;
 	IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTDISCARDS);
---
The basic idea is to use the same scheme currently used for wmem
accounting: do the book-keeping inside the loop and set the atomic
reference counter only once at the end of the loop.

WDYT?

Thanks,

Paolo
Keyboard shortcuts:
h: back out one level
j: next message in thread
k: previous message in thread
l: drill in
Esc: close help / fold thread tree
?: toggle this help