Re: [PATCH net-next v7 5/5] veth: time-based BQL completion coalescing via... | netdev

Re: [PATCH net-next v7 5/5] veth: time-based BQL completion coalescing via ethtool tx-usecs

From: Simon Schippers <hidden>
Date: 2026-06-30 19:07:31
Also in: bpf, lkml

On 6/30/26 16:00, Jonas Köppeler wrote:

On 6/13/26 4:14 PM, Simon Schippers wrote:

quoted

On 6/12/26 10:35, hawk@kernel.org wrote:

quoted

From: Simon Schippers <redacted>

Per-packet BQL completion forces DQL to converge on limit=2, causing
excessive NAPI scheduling overhead and qdisc requeues.

Accumulate BQL completions and flush them when a configurable time
threshold (tx-usecs) is exceeded, letting DQL discover a limit that
bounds actual queuing delay to the configured interval. Coalescing
state persists across NAPI polls in struct veth_rq so completions can
accumulate beyond a single budget=64 cycle.

The flush condition is:

state->time + bql_flush_ns <= current_time || state->n_bql > dql.limit

Flushing when n_bql exceeds dql.limit handles BQL starvation.

The comparison is strictly greater-than because netdev_tx_sent_queue()
always lets the producer exceed the limit by one before it stops, so
n_bql == dql.limit is a normal in-flight state. dql.limit lives in
the same cacheline as the completion path, so the check is cheap.

Add ethtool tx-usecs support for runtime tuning. Default is 100 us;
setting tx-usecs to 0 disables coalescing and falls back to per-packet
completion.

   ethtool -C <veth-dev> tx-usecs 500  # 500us coalescing
   ethtool -C <veth-dev> tx-usecs 0    # per-packet (no coalescing)

Co-developed-by: Jesper Dangaard Brouer <hawk@kernel.org>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Co-developed-by: Jonas Köppeler <redacted>
Signed-off-by: Jonas Köppeler <redacted>
Signed-off-by: Simon Schippers <redacted>
---
  drivers/net/veth.c | 123 ++++++++++++++++++++++++++++++++++++++++++---
  1 file changed, 117 insertions(+), 6 deletions(-)

diff --git a/drivers/net/veth.c b/drivers/net/veth.c
index 2473f730734b..c62d87a8402c 100644
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c

@@ -28,6 +28,7 @@
  #include <linux/bpf_trace.h>
  #include <linux/net_tstamp.h>
  #include <linux/skbuff_ref.h>
+#include <linux/sched/clock.h>
  #include <net/page_pool/helpers.h>
    #define DRV_NAME    "veth"

@@ -50,6 +51,7 @@
   * delay => 64 * 250 ms = 16 s.
   */
  #define VETH_WATCHDOG_TIMEOUT_MS    (64 * 250)
+#define VETH_BQL_COAL_TX_USECS    100 /* default tx-usecs for BQL batching*/
    struct veth_stats {
      u64    rx_drops;

@@ -69,6 +71,11 @@ struct veth_rq_stats {
      struct u64_stats_sync    syncp;
  };
  +struct veth_bql_state {
+    u64    time;    /* sched_clock() when current coalescing window started */
+    uint    n_bql;    /* BQL completions batched in the current window */
+};
+
  struct veth_rq {
      struct napi_struct    xdp_napi;
      struct napi_struct __rcu *napi; /* points to xdp_napi when the latteris initialized */

@@ -76,6 +83,7 @@ struct veth_rq {
      struct bpf_prog __rcu    *xdp_prog;
      struct xdp_mem_info    xdp_mem;
      struct veth_rq_stats    stats;
+    struct veth_bql_state    bql_state;
      bool            rx_notify_masked;
      struct ptr_ring        xdp_ring;
      struct xdp_rxq_info    xdp_rxq;

@@ -88,6 +96,7 @@ struct veth_priv {
      struct bpf_prog        *_xdp_prog;
      struct veth_rq        *rq;
      unsigned int        requested_headroom;
+    unsigned int        tx_coal_usecs;    /* BQL completion coalescing */
  };
    struct veth_xdp_tx_bq {

@@ -272,7 +281,56 @@ static void veth_get_channels(struct net_device *dev,
  static int veth_set_channels(struct net_device *dev,
                   struct ethtool_channels *ch);
  +static int veth_get_coalesce(struct net_device *dev,
+                 struct ethtool_coalesce *ec,
+                 struct kernel_ethtool_coalesce *kernel_coal,
+                 struct netlink_ext_ack *extack)
+{
+    struct veth_priv *priv = netdev_priv(dev);
+
+    ec->tx_coalesce_usecs = priv->tx_coal_usecs;
+    return 0;
+}
+
+static int veth_set_coalesce(struct net_device *dev,
+                 struct ethtool_coalesce *ec,
+                 struct kernel_ethtool_coalesce *kernel_coal,
+                 struct netlink_ext_ack *extack)
+{
+    struct veth_priv *priv = netdev_priv(dev);
+    struct net_device *peer;
+
+    /* The coalescing window delays BQL completions, so keep tx-usecs well
+     * below the tx_timeout watchdog; otherwise a large value could stall a
+     * stopped queue long enough to trip a false watchdog timeout. Cap at
+     * half the watchdog to leave a generous safety margin. tx-usecs is
+     * microseconds, the watchdog is milliseconds.
+     */
+    if (ec->tx_coalesce_usecs > VETH_WATCHDOG_TIMEOUT_MS / 2 * USEC_PER_MSEC) {
+        NL_SET_ERR_MSG_MOD(extack,
+                   "tx-usecs must stay below half the tx_timeout watchdog");
+        return -ERANGE;
+    }
+
+    /* Paired with READ_ONCE in veth_xdp_rcv(). */
+    WRITE_ONCE(priv->tx_coal_usecs, ec->tx_coalesce_usecs);
+
+    /* veth_xdp_rcv() reads each device's own value, so mirror it onto
+     * the peer to keep the pair symmetric: both directions coalesce
+     * with the same tx-usecs. Called under RTNL, rtnl_dereference() is safe.
+     */
+    peer = rtnl_dereference(priv->peer);
+    if (peer) {
+        struct veth_priv *peer_priv = netdev_priv(peer);
+
+        WRITE_ONCE(peer_priv->tx_coal_usecs, ec->tx_coalesce_usecs);
+    }
+
+    return 0;
+}
+
  static const struct ethtool_ops veth_ethtool_ops = {
+    .supported_coalesce_params = ETHTOOL_COALESCE_TX_USECS,
      .get_drvinfo        = veth_get_drvinfo,
      .get_link        = ethtool_op_get_link,
      .get_strings        = veth_get_strings,

@@ -282,6 +340,8 @@ static const struct ethtool_ops veth_ethtool_ops ={
      .get_ts_info        = ethtool_op_get_ts_info,
      .get_channels        = veth_get_channels,
      .set_channels        = veth_set_channels,
+    .get_coalesce        = veth_get_coalesce,
+    .set_coalesce        = veth_set_coalesce,
  };
    /* general routines */

@@ -969,13 +1029,54 @@ static struct sk_buff *veth_xdp_rcv_skb(struct veth_rq *rq,
      return NULL;
  }
  +static void veth_bql_maybe_complete(struct veth_bql_state *state,
+                    struct netdev_queue *peer_txq,
+                    u64 bql_flush_ns)
+{
+    u64 current_time;
+
+    /* There is no reason to complete with 0 and
+     * peer_txq could go away.
+     */
+    if (!state->n_bql || !peer_txq)
+        return;
+
+    current_time = sched_clock();
+
+    /* We complete if:
+     * 1. We reach bql_flush_ns.
+     * 2. We potentially have BQL starvation.
+     */
+    if (state->time + bql_flush_ns <= current_time ||
+        state->n_bql > peer_txq->dql.limit) {

Indeed, this does not compile when CONFIG_BQL is not set. I think we should just bring back the 'queue is empty + queue is stopped' check from v6 back at the end of the poll and remove the n_bql > dql.limit check.

We would put #ifdef CONFIG_BQL around that logic aswell.

It also feels not obvious why this is handling the starvation case. This only works, because the producer has went overlimit previously and was stopped. So more than 'limit' packets have been enqueued to the ring, and they are eventually drained when this check is true.

I think it just needs some comment tweaking:

/* We complete if:
 * 1. We reach bql_flush_ns.
 * 2. We have BQL starvation. This means that the queue was over-limit
 *    in the last interval, and there is no more data in the queue,
 *    which is equivalent to we consumed more than limit items.
 */

By removing this we can also avoid accessing dql internal members, but if you don't think that's a problem we can leave as is.

I agree accessing dql internal variables is not perfect.

That is why I have locally implemented DQL for software interfaces in
a generic way inside dynamic_queue_limits.{h,c}.
I was able to squeeze the time and n_bql variables into the completion
cacheline of the dql struct by moving around variables.
The logic applies inside dql_completed() if enabled.
With this we just have to call netdev_completed_queue().
Also it allows for per-queue tweaking of tx_usecs via sysfs.
Works well for me, can share it if we want to use it.

Further, this is only works if VETH_BQL_UNIT stays 1, otherwise it will never fire. Anyway, still its necessary to check for CONFIG_BQL. But we could solve this by adding VETH_BQL_UNIT to n_bql instead of 1. This is also safe from any overflows, since limit is bound to limit_max, inflight is always less than limit + 1*VETH_BQL_UNIT and n_bql <= inflight.

You are right.

But I think there is no reason for VETH_BQL_UNIT anyway.
There should be no difference in the BQL algorithm, I personally
would replace VETH_BQL_UNIT with a hard-coded 1.

In a version of bringing back the 'queue-empty' check and keeping most of the current logic (so a mixture of v6 and v7) resulted in the same performance on an x86_64 architecture.

quoted

Both Sashiko-Nipa and Sashiko-Gemini are right, this is missing a
#ifdef CONFIG_BQL. Not sure what is the best way to add them.
And for the struct we could maybe do:

#ifdef CONFIG_BQL
struct veth_bql_state {
     u64    time;    /* sched_clock() when current coalescing window started */
     uint    n_bql;    /* BQL completions batched in the current window */
};
#else
struct veth_bql_state {};
#endif

Regarding the configs: we can just do something along those lines.
struct veth_rq {
...
#ifdef CONFIG_BQL
    struct veth_bql_state        dql;
#endif
...
}

and we put the rest of the code that accesses or performs an action regarding bql in some functions and do it like in netdev_* functions with

Function-Signature()
{
#ifdef CONFIG_BQL
// Code
#endif
}

Wdyt?
- Jonas

Yes, we have to. Unless we put it into dynamic_queue_limits.{h,c}
of course :^)

Thanks,
Simon

quoted

+        netdev_tx_completed_queue(peer_txq, state->n_bql,
+                      state->n_bql * VETH_BQL_UNIT);
+        state->time = current_time;
+        state->n_bql = 0;
+    }
+}
+
  static int veth_xdp_rcv(struct veth_rq *rq, int budget,
              struct veth_xdp_tx_bq *bq,
              struct veth_stats *stats,
              struct netdev_queue *peer_txq)
  {
+    struct veth_priv *priv = netdev_priv(rq->dev);
+    struct veth_bql_state *state = &rq->bql_state;
      int i, done = 0, n_xdpf = 0;
      void *xdpf[VETH_XDP_BATCH];
+    u64 bql_flush_ns;
+
+    /* Mirrored to both peers; paired with WRITE_ONCE() in veth_set_coalesce */
+    bql_flush_ns = (u64)READ_ONCE(priv->tx_coal_usecs) * 1000;
+
+    /* Clamp stored timestamp in case we migrated to a CPU with a behind
+     * sched_clock(); tries to reduce late BQL flushes.
+     */
+    state->time = min(state->time, sched_clock());
+
+    /* Flush completions that timed out since the previous NAPI poll. */
+    veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);>>
      for (i = 0; i < budget; i++) {
          void *ptr = __ptr_ring_consume(&rq->xdp_ring);

@@ -1000,12 +1101,11 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
              }
          } else {
              /* ndo_start_xmit */
-            bool bql_charged = veth_ptr_is_bql(ptr);
              struct sk_buff *skb = veth_ptr_to_skb(ptr);
  +            if (veth_ptr_is_bql(ptr))
+                state->n_bql++;
              stats->xdp_bytes += skb->len;
-            if (peer_txq && bql_charged)
-                netdev_tx_completed_queue(peer_txq, 1, VETH_BQL_UNIT);
                skb = veth_xdp_rcv_skb(rq, skb, bq, stats);
              if (skb) {
@@ -1015,6 +1115,7 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
                      napi_gro_receive(&rq->xdp_napi, skb);
              }
          }
+        veth_bql_maybe_complete(state, peer_txq, bql_flush_ns);
          done++;

Sashiko-Nipa reports:

"If veth_xdp_rcv() finishes and returns a done count less than the budget,
NAPI will go to sleep in veth_poll(). Do we need to unconditionally flush
any stranded BQL completions in veth_poll() before sleeping?
If completions are left in rq->bql_state indefinitely across NAPI idle
periods, it might present an artificially massive delay to DQL. This could
cause DQL to mistakenly conclude the hardware is extremely slow and
aggressively shrink dql.limit to its minimum, crippling throughput on
subsequent bursts."

Again the issue that I found to be non-problematic in [1] and can be
seen by an BQL inflight > 0 when for example pktgen suddenly stops.

If we would "unconditionally flush any stranded BQL completions in
veth_poll() before sleeping" we would *not* accumulate BQL completions
across NAPI polls but we want to do that.

Do you agree?

[1] https://lore.kernel.org/netdev/c8650d3a-e488-4279-b28f-549d766c23a1@tu-dortmund.de/ (local)

`h`	back out one level
`j`	next message in thread
`k`	previous message in thread
`l`	drill in
`Esc`	close help / fold thread tree
`?`	toggle this help