[PATCH net 1/3 v2] net: Extend bpf_net_context lifetime to cover qdisc enqueue
From: Jamal Hadi Salim <jhs@mojatatu.com>
Date: 2026-06-29 10:22:14
Also in:
bpf, linux-rt-devel, stable
Subsystem:
networking [general], the rest · Maintainers:
"David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds
The bpf_net_context used by sch_handle_egress() is stack-allocated and torn
down when that function returns. By the time tcf_qevent_handle() runs,
current->bpf_net_context is NULL.
When a filter attached to a qevent block (e.g. RED's early_drop or mark
qevents, which always use shared blocks) returns TC_ACT_REDIRECT,
tcf_qevent_handle() calls skb_do_redirect(), which in turn calls bpf helper
bpf_net_ctx_get_ri(). That helper unconditionally dereferences
current->bpf_net_context resulting in a NULL pointer dereference.
Note: The same holds for actions that invoke BPF redirect helpers
(e.g. act_bpf running a program that calls bpf_redirect()) during qevent
classification itself.
Fix:
Move the bpf_net_context lifecycle out of sch_handle_egress() into
__dev_queue_xmit(), so that it spans both the egress TC fast path and the
qdisc enqueue.
Note: The call is placed outside the egress_needed_key static branch
to cover the case where the clsact static key is disabled. Unfortunately
this adds a small unconditional _per packet_ penalty to the code path
(two writes and one read), guarded only by CONFIG_NET_XGRESS.
As pointed out by sashiko [1]:
The same context must also be set up in net_tx_action()'s qdisc drain
path, since qdisc_run() -> netem_dequeue() -> qdisc_enqueue(RED child)
can trigger qevent classification asynchronously from softirq context.
This keeps all bpf_net_context management in net/core/dev.c, i.e. at the
existing boundary between tc core and BPF, without requiring any
net/sched/ code to know about BPF plumbing.
Reproducer:
tc qdisc add dev eth0 root handle 1: red limit 1MB min 10KB max 20KB \
avpkt 1000 burst 100 qevent early_drop block 10
tc filter add block 10 pref 1 bpf obj redirect.o
Traffic through eth0 triggers red_enqueue() -> tcf_qevent_handle() and,
on a redirect verdict, a NULL deref in skb_do_redirect().
Fixes: 3625750f05ec ("net: sched: Introduce helpers for qevent blocks")
Tested-by: Victor Nogueira <redacted>
Signed-off-by: Jamal Hadi Salim <jhs@mojatatu.com>
---
net/core/dev.c | 31 +++++++++++++++++++++++--------
1 file changed, 23 insertions(+), 8 deletions(-)
diff --git a/net/core/dev.c b/net/core/dev.c
index 4b3d5cfdf6e0..b95a8b153c76 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c@@ -4527,14 +4527,11 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) { struct bpf_mprog_entry *entry = rcu_dereference_bh(dev->tcx_egress); enum skb_drop_reason drop_reason = SKB_DROP_REASON_TC_EGRESS; - struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; int sch_ret; if (!entry) return skb; - bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); - /* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was * already set by the caller. */
@@ -4550,12 +4547,10 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) /* No need to push/pop skb's mac_header here on egress! */ skb_do_redirect(skb); *ret = NET_XMIT_SUCCESS; - bpf_net_ctx_clear(bpf_net_ctx); return NULL; case TC_ACT_SHOT: kfree_skb_reason(skb, drop_reason); *ret = NET_XMIT_DROP; - bpf_net_ctx_clear(bpf_net_ctx); return NULL; /* used by tc_run */ case TC_ACT_STOLEN:
@@ -4565,10 +4560,8 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) fallthrough; case TC_ACT_CONSUMED: *ret = NET_XMIT_SUCCESS; - bpf_net_ctx_clear(bpf_net_ctx); return NULL; } - bpf_net_ctx_clear(bpf_net_ctx); return skb; }
@@ -4767,6 +4760,9 @@ struct netdev_queue *netdev_core_pick_tx(struct net_device *dev, */ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) { +#ifdef CONFIG_NET_XGRESS + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx = NULL; +#endif struct net_device *dev = skb->dev; struct netdev_queue *txq = NULL; enum skb_drop_reason reason;
@@ -4795,6 +4791,9 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) skb_update_prio(skb); tcx_set_ingress(skb, false); +#ifdef CONFIG_NET_XGRESS + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); +#endif #ifdef CONFIG_NET_EGRESS if (static_branch_unlikely(&egress_needed_key)) { if (nf_hook_egress_active()) {
@@ -4898,12 +4897,18 @@ int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev) reason = SKB_DROP_REASON_RECURSION_LIMIT; drop: +#ifdef CONFIG_NET_XGRESS + bpf_net_ctx_clear(bpf_net_ctx); +#endif rcu_read_unlock_bh(); dev_core_stats_tx_dropped_inc(dev); kfree_skb_list_reason(skb, reason); return rc; out: +#ifdef CONFIG_NET_XGRESS + bpf_net_ctx_clear(bpf_net_ctx); +#endif rcu_read_unlock_bh(); return rc; }
@@ -5815,6 +5820,9 @@ static __latent_entropy void net_tx_action(void) if (sd->output_queue) { struct Qdisc *head; +#ifdef CONFIG_NET_XGRESS + struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx; +#endif local_irq_disable(); head = sd->output_queue;
@@ -5824,6 +5832,10 @@ static __latent_entropy void net_tx_action(void) rcu_read_lock(); +#ifdef CONFIG_NET_XGRESS + bpf_net_ctx = bpf_net_ctx_set(&__bpf_net_ctx); +#endif + while (head) { spinlock_t *root_lock = NULL; struct sk_buff *to_free;
@@ -5860,6 +5872,10 @@ static __latent_entropy void net_tx_action(void) tcf_kfree_skb_list(to_free, q, NULL, qdisc_dev(q)); } +#ifdef CONFIG_NET_XGRESS + bpf_net_ctx_clear(bpf_net_ctx); +#endif + rcu_read_unlock(); }
--
2.54.0