Re: [PATCH net-next v7 5/5] veth: time-based BQL completion coalescing via... | netdev

Re: [PATCH net-next v7 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs

From: Jonas Köppeler <hidden>
Date: 2026-06-30 14:03:38
Also in: bpf, lkml

On 6/13/26 4:14 PM, Simon Schippers wrote:

On 6/12/26 10:35, hawk@kernel.org wrote:

quoted

From: Simon Schippers <redacted>

Per-packet BQL completion forces DQL to converge on limit=2, causing
excessive NAPI scheduling overhead and qdisc requeues.

Accumulate BQL completions and flush them when a configurable time
threshold (tx-usecs) is exceeded, letting DQL discover a limit that
bounds actual queuing delay to the configured interval. Coalescing
state persists across NAPI polls in struct veth_rq so completions can
accumulate beyond a single budget=64 cycle.

The flush condition is:

state->time + bql_flush_ns <= current_time || state->n_bql > dql.limit

Flushing when n_bql exceeds dql.limit handles BQL starvation.

The comparison is strictly greater-than because netdev_tx_sent_queue()
always lets the producer exceed the limit by one before it stops, so
n_bql == dql.limit is a normal in-flight state. dql.limit lives in
the same cacheline as the completion path, so the check is cheap.

Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
setting tx-usecs to 0 disables coalescing and falls back to per-packet
completion.

   ethtool -C <veth-dev> tx-usecs 500  # 500us coalescing
   ethtool -C <veth-dev> tx-usecs 0    # per-packet (no coalescing)

Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Co-developed-by: Jonas Köppeler <redacted>
Signed-off-by: Jonas Köppeler <redacted>
Signed-off-by: Simon Schippers <redacted>
---
  drivers/net/veth.c | 123 ++++++++++++++++++++++++++++++++++++++++++---
  1 file changed, 117 insertions(+), 6 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 2473f730734b..c62d87a8402c 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c

@@ -28,6 +28,7 @@
  #include <linux/bpf_trace.h>
  #include <linux/net_tstamp.h>
  #include <linux/skbuff_ref.h>
+#include <linux/sched/clock.h>
  #include <net/page_pool/helpers.h>
  
  #define DRV_NAME	"veth"

@@ -50,6 +51,7 @@
   * delay => 64 * 250 ms = 16 s.
   */
  #define VETH_WATCHDOG_TIMEOUT_MS	(64 * 250)
+#define VETH_BQL_COAL_TX_USECS	100 /* default tx-usecs for BQL batching*/
  
  struct veth_stats {
  	u64	rx_drops;

@@ -69,6 +71,11 @@ struct veth_rq_stats {
  	struct u64_stats_sync	syncp;
  };
  
+struct veth_bql_state {
+	u64	time;	/* sched_clock() when current coalescing window started */
+	uint	n_bql;	/* BQL completions batched in the current window */
+};
+
  struct veth_rq {
  	struct napi_struct	xdp_napi;
  	struct napi_struct __rcu *napi; /* points to xdp_napi when the latteris initialized */

@@ -76,6 +83,7 @@ struct veth_rq {
  	struct bpf_prog __rcu	*xdp_prog;
  	struct xdp_mem_info	xdp_mem;
  	struct veth_rq_stats	stats;
+	struct veth_bql_state	bql_state;
  	bool			rx_notify_masked;
  	struct ptr_ring		xdp_ring;
  	struct xdp_rxq_info	xdp_rxq;

@@ -88,6 +96,7 @@ struct veth_priv {
  	struct bpf_prog		*_xdp_prog;
  	struct veth_rq		*rq;
  	unsigned int		requested_headroom;
+	unsigned int		tx_coal_usecs;	/* BQL completion coalescing */
  };
  
  struct veth_xdp_tx_bq {

@@ -272,7 +281,56 @@ static void veth_get_channels(struct net_device *dev,
  static int veth_set_channels(struct net_device *dev,
  			     struct ethtool_channels *ch);
  
+static int veth_get_coalesce(struct net_device *dev,
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	ec->tx_coalesce_usecs = priv->tx_coal_usecs;
+	return 0;
+}
+
+static int veth_set_coalesce(struct net_device *dev,
+			     struct ethtool_coalesce *ec,
+			     struct kernel_ethtool_coalesce *kernel_coal,
+			     struct netlink_ext_ack *extack)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+	struct net_device *peer;
+
+	/* The coalescing window delays BQL completions, so keep tx-usecs well
+	 * below the tx_timeout watchdog; otherwise a large value could stall a
+	 * stopped queue long enough to trip a false watchdog timeout. Cap at
+	 * half the watchdog to leave a generous safety margin. tx-usecs is
+	 * microseconds, the watchdog is milliseconds.
+	 */
+	if (ec->tx_coalesce_usecs > VETH_WATCHDOG_TIMEOUT_MS / 2 * USEC_PER_MSEC) {
+		NL_SET_ERR_MSG_MOD(extack,
+				   "tx-usecs must stay below half the tx_timeout watchdog");
+		return -ERANGE;
+	}
+
+	/* Paired with READ_ONCE in veth_xdp_rcv(). */
+	WRITE_ONCE(priv->tx_coal_usecs, ec->tx_coalesce_usecs);
+
+	/* veth_xdp_rcv() reads each device's own value, so mirror it onto
+	 * the peer to keep the pair symmetric: both directions coalesce
+	 * with the same tx-usecs. Called under RTNL, rtnl_dereference() is safe.
+	 */
+	peer = rtnl_dereference(priv->peer);
+	if (peer) {
+		struct veth_priv *peer_priv = netdev_priv(peer);
+
+		WRITE_ONCE(peer_priv->tx_coal_usecs, ec->tx_coalesce_usecs);
+	}
+
+	return 0;
+}
+
  static const struct ethtool_ops veth_ethtool_ops = {
+	.supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS,
  	.get_drvinfo		= veth_get_drvinfo,
  	.get_link		= ethtool_op_get_link,
  	.get_strings		= veth_get_strings,

@@ -282,6 +340,8 @@ static const struct ethtool_ops veth_ethtool_ops ={
  	.get_ts_info		= ethtool_op_get_ts_info,
  	.get_channels		= veth_get_channels,
  	.set_channels		= veth_set_channels,
+	.get_coalesce		= veth_get_coalesce,
+	.set_coalesce		= veth_set_coalesce,
  };
  
  /* general routines */

@@ -969,13 +1029,54 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
  	return NULL;
  }
  
+static void veth_bql_maybe_complete(struct veth_bql_state *state,
+				    struct netdev_queue *peer_txq,
+				    u64 bql_flush_ns)
+{
+	u64 current_time;
+
+	/* There is no reason to complete with 0 and
+	 * peer_txq could go away.
+	 */
+	if (!state->n_bql || !peer_txq)
+		return;
+
+	current_time = sched_clock();
+
+	/* We complete if:
+	 * 1. We reach bql_flush_ns.
+	 * 2. We potentially have BQL starvation.
+	 */
+	if (state->time + bql_flush_ns <= current_time ||
+	    state->n_bql > peer_txq->dql.limit) {

Indeed, this does not compile when CONFIG_BQL is not set. I think we 
should just bring back the 'queue is empty + queue is stopped' check 
from v6 back at the end of the poll and remove the n_bql > dql.limit 
check. It also feels not obvious why this is handling the starvation 
case. This only works, because the producer has went overlimit 
previously and was stopped. So more than 'limit' packets have been 
enqueued to the ring, and they are eventually drained when this check is 
true. By removing this we can also avoid accessing dql internal members, 
but if you don't think that's a problem we can leave as is.

Further, this is only works if VETH_BQL_UNIT stays 1, otherwise it will 
never fire. Anyway, still its necessary to check for CONFIG_BQL. But we 
could solve this by adding VETH_BQL_UNIT to n_bql instead of 1. This is 
also safe from any overflows, since limit is bound to limit_max, 
inflight is always less than limit + 1*VETH_BQL_UNIT and n_bql <= inflight.

In a version of bringing back the 'queue-empty' check and keeping most 
of the current logic (so a mixture of v6 and v7) resulted in the same 
performance on an x86_64 architecture.

Both Sashiko-Nipa and Sashiko-Gemini are right, this is missing a
#ifdef CONFIG_BQL. Not sure what is the best way to add them.
And for the struct we could maybe do:

#ifdef CONFIG_BQL
struct veth_bql_state {
     u64	time;	/* sched_clock() when current coalescing window started */
     uint	n_bql;	/* BQL completions batched in the current window */
};
#else
struct veth_bql_state {};
#endif

Regarding the configs: we can just do something along those lines.
struct veth_rq {
...
#ifdef CONFIG_BQL
	struct veth_bql_state		dql;
#endif
...
}

and we put the rest of the code that accesses or performs an action 
regarding bql in some functions and do it like in netdev_* functions with

Function-Signature()
{
#ifdef CONFIG_BQL
// Code
#endif
}

Wdyt?
- Jonas

quoted

+		netdev_tx_completed_queue(peer_txq, state->n_bql,
+					  state->n_bql * VETH_BQL_UNIT);
+		state->time = current_time;
+		state->n_bql = 0;
+	}
+}
+
  static int veth_xdp_rcv(struct veth_rq *rq, int budget,
  			struct veth_xdp_tx_bq *bq,
  			struct veth_stats *stats,
  			struct netdev_queue *peer_txq)
  {
+	struct veth_priv *priv = netdev_priv(rq->dev);
+	struct veth_bql_state *state = &rq->bql_state;
  	int i, done = 0, n_xdpf = 0;
  	void *xdpf[VETH_XDP_BATCH];
+	u64 bql_flush_ns;
+
+	/* Mirrored to both peers; paired with WRITE_ONCE() in veth_set_coalesce */
+	bql_flush_ns = (u64)READ_ONCE(priv->tx_coal_usecs) * 1000;
+
+	/* Clamp stored timestamp in case we migrated to a CPU with a behind
+	 * sched_clock(); tries to reduce late BQL flushes.
+	 */
+	state->time = min(state->time, sched_clock());
+
+	/* Flush completions that timed out since the previous NAPI poll. */
+	veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);>>
  	for (i = 0; i < budget; i++) {
  		void *ptr = __ptr_ring_consume(&rq->xdp_ring);

@@ -1000,12 +1101,11 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
  			}
  		} else {
  			/* ndo_start_xmit */
-			bool bql_charged = veth_ptr_is_bql(ptr);
  			struct sk_buff *skb = veth_ptr_to_skb(ptr);
  
+			if (veth_ptr_is_bql(ptr))
+				state->n_bql++;
  			stats->xdp_bytes += skb->len;
-			if (peer_txq && bql_charged)
-				netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT);
  
  			skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
  			if (skb) {
@@ -1015,6 +1115,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
  					napi_gro_receive(&rq->xdp_napi, skb);
  			}
  		}
+		veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
  		done++;

Sashiko-Nipa reports:

"If veth_xdp_rcv() finishes and returns a done count less than the budget,
NAPI will go to sleep in veth_poll(). Do we need to unconditionally flush
any stranded BQL completions in veth_poll() before sleeping?
If completions are left in rq->bql_state indefinitely across NAPI idle
periods, it might present an artificially massive delay to DQL. This could
cause DQL to mistakenly conclude the hardware is extremely slow and
aggressively shrink dql.limit to its minimum, crippling throughput on
subsequent bursts."

Again the issue that I found to be non-problematic in [1] and can be
seen by an BQL inflight > 0 when for example pktgen suddenly stops.

If we would "unconditionally flush any stranded BQL completions in
veth_poll() before sleeping" we would *not* accumulate BQL completions
across NAPI polls but we want to do that.

Do you agree?

[1] https://lore.kernel.org/netdev/c8650d3a-e488-4279-b28f-549d766c23a1@tu-dortmund.de/ (local)

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help