Thread (60 messages) 60 messages, 10 authors, 2024-01-20
STALE881d
Revisions (8)
  1. v1 current
  2. v2 [diff vs current]
  3. v3 [diff vs current]
  4. v5 [diff vs current]
  5. v6 [diff vs current]
  6. v7 [diff vs current]
  7. v8 [diff vs current]
  8. v9 [diff vs current]

[PATCH net-next 15/24] net: Use nested-BH locking for XDP redirect.

From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: 2023-12-15 17:10:40
Also in: bpf, lkml
Subsystem: bpf [general] (safe dynamic programs and tools), bpf [networking] (tcx & tc bpf, sock_addr), networking drivers, networking [general], tc subsystem, the rest, vmware vmxnet3 ethernet driver, xdp (express data path) · Maintainers: Alexei Starovoitov, Daniel Borkmann, Andrii Nakryiko, Eduard Zingerman, Kumar Kartikeya Dwivedi, Andrew Lunn, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Jamal Hadi Salim, Jiri Pirko, Linus Torvalds, Ronak Doshi, David S. Miller, Jesper Dangaard Brouer, John Fastabend

The per-CPU variables used during bpf_prog_run_xdp() invocation and
later during xdp_do_redirect() rely on disabled BH for their protection.
Without locking in local_bh_disable() on PREEMPT_RT these data structure
require explicit locking.

This is a follow-up on the previous change which introduced
bpf_run_lock.redirect_lock and uses it now within drivers.

The simple way is to acquire the lock before bpf_prog_run_xdp() is
invoked and hold it until the end of function.
This does not always work because some drivers (cpsw, atlantic) invoke
xdp_do_flush() in the same context.
Acquiring the lock in bpf_prog_run_xdp() and dropping in
xdp_do_redirect() (without touching drivers) does not work because not
all driver, which use bpf_prog_run_xdp(), do support XDP_REDIRECT (and
invoke xdp_do_redirect()).

Ideally the minimal locking scope would be bpf_prog_run_xdp() +
xdp_do_redirect() and everything else (error recovery, DMA unmapping,
free/ alloc of memory, …) would happen outside of the locked section.

Cc: Alexei Starovoitov <ast@kernel.org>
Cc: Andrii Nakryiko <andrii@kernel.org>
Cc: Cong Wang <redacted>
Cc: Hao Luo <redacted>
Cc: Jamal Hadi Salim <jhs@mojatatu.com>
Cc: Jesper Dangaard Brouer <hawk@kernel.org>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: Jiri Pirko <jiri@resnulli.us>
Cc: John Fastabend <john.fastabend@gmail.com>
Cc: KP Singh <kpsingh@kernel.org>
Cc: Martin KaFai Lau <martin.lau@linux.dev>
Cc: Ronak Doshi <redacted>
Cc: Song Liu <song@kernel.org>
Cc: Stanislav Fomichev <redacted>
Cc: VMware PV-Drivers Reviewers <redacted>
Cc: Yonghong Song <yonghong.song@linux.dev>
Cc: bpf@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 drivers/net/vmxnet3/vmxnet3_xdp.c |  1 +
 kernel/bpf/cpumap.c               |  2 ++
 net/bpf/test_run.c                | 11 ++++++++---
 net/core/dev.c                    |  3 +++
 net/core/filter.c                 |  1 +
 net/core/lwt_bpf.c                |  2 ++
 net/sched/cls_api.c               |  2 ++
 7 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/drivers/net/vmxnet3/vmxnet3_xdp.c b/drivers/net/vmxnet3/vmxnet3_xdp.c
index 80ddaff759d47..18bce98fd2e31 100644
--- a/drivers/net/vmxnet3/vmxnet3_xdp.c
+++ b/drivers/net/vmxnet3/vmxnet3_xdp.c
@@ -257,6 +257,7 @@ vmxnet3_run_xdp(struct vmxnet3_rx_queue *rq, struct xdp_buff *xdp,
 	u32 act;
 
 	rq->stats.xdp_packets++;
+	guard(local_lock_nested_bh)(&bpf_run_lock.redirect_lock);
 	act = bpf_prog_run_xdp(prog, xdp);
 	page = virt_to_page(xdp->data_hard_start);
 
diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c
index 8a0bb80fe48a3..c26d49bb78679 100644
--- a/kernel/bpf/cpumap.c
+++ b/kernel/bpf/cpumap.c
@@ -144,6 +144,7 @@ static void cpu_map_bpf_prog_run_skb(struct bpf_cpu_map_entry *rcpu,
 	int err;
 
 	list_for_each_entry_safe(skb, tmp, listp, list) {
+		guard(local_lock_nested_bh)(&bpf_run_lock.redirect_lock);
 		act = bpf_prog_run_generic_xdp(skb, &xdp, rcpu->prog);
 		switch (act) {
 		case XDP_PASS:
@@ -182,6 +183,7 @@ static int cpu_map_bpf_prog_run_xdp(struct bpf_cpu_map_entry *rcpu,
 	struct xdp_buff xdp;
 	int i, nframes = 0;
 
+	guard(local_lock_nested_bh)(&bpf_run_lock.redirect_lock);
 	xdp_set_return_frame_no_direct();
 	xdp.rxq = &rxq;
 
diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c
index c9fdcc5cdce10..db8f7eb35c6ca 100644
--- a/net/bpf/test_run.c
+++ b/net/bpf/test_run.c
@@ -293,6 +293,7 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
 	batch_sz = min_t(u32, repeat, xdp->batch_size);
 
 	local_bh_disable();
+	local_lock_nested_bh(&bpf_run_lock.redirect_lock);
 	xdp_set_return_frame_no_direct();
 
 	for (i = 0; i < batch_sz; i++) {
@@ -348,6 +349,9 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
 	}
 
 out:
+	xdp_clear_return_frame_no_direct();
+	local_unlock_nested_bh(&bpf_run_lock.redirect_lock);
+
 	if (redirect)
 		xdp_do_flush();
 	if (nframes) {
@@ -356,7 +360,6 @@ static int xdp_test_run_batch(struct xdp_test_data *xdp, struct bpf_prog *prog,
 			err = ret;
 	}
 
-	xdp_clear_return_frame_no_direct();
 	local_bh_enable();
 	return err;
 }
@@ -417,10 +420,12 @@ static int bpf_test_run(struct bpf_prog *prog, void *ctx, u32 repeat,
 	do {
 		run_ctx.prog_item = &item;
 		local_bh_disable();
-		if (xdp)
+		if (xdp) {
+			guard(local_lock_nested_bh)(&bpf_run_lock.redirect_lock);
 			*retval = bpf_prog_run_xdp(prog, ctx);
-		else
+		} else {
 			*retval = bpf_prog_run(prog, ctx);
+		}
 		local_bh_enable();
 	} while (bpf_test_timer_continue(&t, 1, repeat, &ret, time));
 	bpf_reset_run_ctx(old_ctx);
diff --git a/net/core/dev.c b/net/core/dev.c
index 5a0f6da7b3ae5..5ba7509e88752 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -3993,6 +3993,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		*pt_prev = NULL;
 	}
 
+	guard(local_lock_nested_bh)(&bpf_run_lock.redirect_lock);
 	qdisc_skb_cb(skb)->pkt_len = skb->len;
 	tcx_set_ingress(skb, true);
 
@@ -4045,6 +4046,7 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
 	if (!entry)
 		return skb;
 
+	guard(local_lock_nested_bh)(&bpf_run_lock.redirect_lock);
 	/* qdisc_skb_cb(skb)->pkt_len & tcx_set_ingress() was
 	 * already set by the caller.
 	 */
@@ -5008,6 +5010,7 @@ int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
 		u32 act;
 		int err;
 
+		guard(local_lock_nested_bh)(&bpf_run_lock.redirect_lock);
 		act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
 		if (act != XDP_PASS) {
 			switch (act) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 7c9653734fb60..72a7812f933a1 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -4241,6 +4241,7 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = {
  */
 void xdp_do_flush(void)
 {
+	guard(local_lock_nested_bh)(&bpf_run_lock.redirect_lock);
 	__dev_flush();
 	__cpu_map_flush();
 	__xsk_map_flush();
diff --git a/net/core/lwt_bpf.c b/net/core/lwt_bpf.c
index a94943681e5aa..74b88e897a7e3 100644
--- a/net/core/lwt_bpf.c
+++ b/net/core/lwt_bpf.c
@@ -44,6 +44,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 	 * BPF prog and skb_do_redirect().
 	 */
 	local_bh_disable();
+	local_lock_nested_bh(&bpf_run_lock.redirect_lock);
 	bpf_compute_data_pointers(skb);
 	ret = bpf_prog_run_save_cb(lwt->prog, skb);
 
@@ -76,6 +77,7 @@ static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
 		break;
 	}
 
+	local_unlock_nested_bh(&bpf_run_lock.redirect_lock);
 	local_bh_enable();
 
 	return ret;
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 1976bd1639863..da61b99bc558f 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -23,6 +23,7 @@
 #include <linux/jhash.h>
 #include <linux/rculist.h>
 #include <linux/rhashtable.h>
+#include <linux/bpf.h>
 #include <net/net_namespace.h>
 #include <net/sock.h>
 #include <net/netlink.h>
@@ -3925,6 +3926,7 @@ struct sk_buff *tcf_qevent_handle(struct tcf_qevent *qe, struct Qdisc *sch, stru
 
 	fl = rcu_dereference_bh(qe->filter_chain);
 
+	guard(local_lock_nested_bh)(&bpf_run_lock.redirect_lock);
 	switch (tcf_classify(skb, NULL, fl, &cl_res, false)) {
 	case TC_ACT_SHOT:
 		qdisc_qstats_drop(sch);
-- 
2.43.0
Keyboard shortcuts
hback out one level
jnext message in thread
kprevious message in thread
ldrill in
Escclose help / fold thread tree
?toggle this help