[PATCH net-next v5] net: reduce RFS/ARFS flow updates by checking LLC affinity
From: Chuang Wang <hidden>
Date: 2026-05-25 02:59:07
Also in:
lkml
Subsystem:
networking [general], the rest · Maintainers:
"David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds
The current implementation of rps_record_sock_flow() updates the flow
table every time a socket is processed on a different CPU. In high-load
scenarios, especially with Accelerated RFS (ARFS), this triggers
frequent flow steering updates via ndo_rx_flow_steer.
For drivers like mlx5 that implement hardware flow steering, these
constant updates lead to significant contention on internal driver locks
(e.g., arfs_lock). This contention often becomes a performance
bottleneck that outweighs the steering benefits.
This patch introduces a cache-aware update strategy: the flow record is
only updated if the flow migrates across Last Level Cache (LLC)
boundaries. This minimizes expensive hardware reconfigurations while
preserving cache locality for the application. A new sysctl,
net.core.rps_feat_llc_affinity, is added to toggle this feature.
Additionally, move sock_rps_record_flow_hash() and sock_rps_record_flow()
out of line into net/core/dev.c and export them. The inline
rps_record_sock_flow() now calls rps_llc_check(), so leaving these helpers
inline would make every modular caller reference that symbol directly.
This is what broke the build when CONFIG_TUN is built as a module, because
tun_flow_update() uses sock_rps_record_flow_hash(). The same applies to
SCTP, which can now use sock_rps_record_flow() safely when built as a
module.
Performance Test Results:
The patch was tested in a K8s environment (AMD CPU 128*2, 16-core Pod
with CPU pinning, mlx5 NIC) using brpc[1] echo_server and rpc_press.
rpc_press Commands:
for i in {1..8}; do
    ./rpc_press -proto=./echo.proto -method=example.EchoService.Echo \
        -server=<IP>:8000 -input='{"message":"hello"}' \
        -qps=0 -thread_num=512 -connection_type=pooled &
done
Monitor mlx5e_rx_flow_steer frequency:
/usr/share/bcc/tools/funccount -i 1 mlx5e_rx_flow_steer
Frequency of mlx5e_rx_flow_steer (via funccount[2]):
Before: ~335,000 counts/sec
After: ~23,000 counts/sec (reduced by ~93%)
System Metrics (after enabling rps_feat_llc_affinity):
CPU Utilization: 38% -> 32%
CPU PSI (Pressure Stall Information): 20% -> 10%
These results demonstrate that filtering updates by LLC affinity
significantly reduces driver lock contention and improves overall
CPU efficiency under heavy network load.
[1] https://github.com/apache/brpc/
[2] https://github.com/iovisor/bcc/blob/master/tools/funccount.py
Signed-off-by: Chuang Wang <redacted>
---
v4 -> v5: fix 'modpost: "rps_llc_check" [net/sctp/sctp.ko] undefined!' reported by kernel test robot
v3 -> v4: add rps_llc_check() (Eric Dumazet)
v2 -> v3: retarget the patch from net to net-next (Jakub Kicinski)
v1 -> v2: add the rps_feat_llc_affinity sysctl; add brpc test results
include/net/rps.h | 28 ++++----------
net/core/dev.c | 76 ++++++++++++++++++++++++++++++++++++++
net/core/sysctl_net_core.c | 35 ++++++++++++++++++
3 files changed, 119 insertions(+), 20 deletions(-)
diff --git a/include/net/rps.h b/include/net/rps.h
index e33c6a2fa8bb..6dacf0888a6c 100644
--- a/include/net/rps.h
+++ b/include/net/rps.h@@ -12,6 +12,7 @@ extern struct static_key_false rps_needed; extern struct static_key_false rfs_needed; +extern struct static_key_false rps_feat_llc_affinity; /* * This structure holds an RPS map which can be of variable length. The
@@ -55,11 +56,14 @@ struct rps_sock_flow_table { #define RPS_NO_CPU 0xffff +bool rps_llc_check(u32 old_val, u32 new_val); + static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash) { unsigned int index = hash & rps_tag_to_mask(tag_ptr); u32 val = hash & ~net_hotdata.rps_cpu_mask; struct rps_sock_flow_table *table; + u32 old_val; /* We only give a hint, preemption can change CPU under us */ val |= raw_smp_processor_id();
@@ -68,7 +72,8 @@ static inline void rps_record_sock_flow(rps_tag_ptr tag_ptr, u32 hash) /* The following WRITE_ONCE() is paired with the READ_ONCE() * here, and another one in get_rps_cpu(). */ - if (READ_ONCE(table[index].ent) != val) + old_val = READ_ONCE(table[index].ent); + if (old_val != val && rps_llc_check(old_val, val)) WRITE_ONCE(table[index].ent, val); }
@@ -136,25 +141,8 @@ static inline bool rfs_is_needed(void) #endif } -static inline void sock_rps_record_flow_hash(__u32 hash) -{ -#ifdef CONFIG_RPS - if (!rfs_is_needed()) - return; - - _sock_rps_record_flow_hash(hash); -#endif -} - -static inline void sock_rps_record_flow(const struct sock *sk) -{ -#ifdef CONFIG_RPS - if (!rfs_is_needed()) - return; - - _sock_rps_record_flow(sk); -#endif -} +void sock_rps_record_flow_hash(__u32 hash); +void sock_rps_record_flow(const struct sock *sk); static inline void sock_rps_delete_flow(const struct sock *sk) {
diff --git a/net/core/dev.c b/net/core/dev.c
index 26ac8eb9b259..f98216ddaec1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c@@ -4997,6 +4997,8 @@ struct static_key_false rps_needed __read_mostly; EXPORT_SYMBOL(rps_needed); struct static_key_false rfs_needed __read_mostly; EXPORT_SYMBOL(rfs_needed); +struct static_key_false rps_feat_llc_affinity __read_mostly; +EXPORT_SYMBOL(rps_feat_llc_affinity); static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr) {
@@ -5208,6 +5210,58 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	return cpu;
 }
 
+/**
+ * rps_llc_check - Determine if RPS flow table should be updated
+ * @old_val: Previous flow record value
+ * @new_val: Target flow record value
+ *
+ * Both values carry the CPU id in the low rps_cpu_mask bits and flow hash
+ * bits above them. Returns true if the record needs an update.
+ */
+bool rps_llc_check(u32 old_val, u32 new_val)
+{
+	/* Mask with rps_cpu_mask (not its complement) to get the CPU id */
+	u32 old_cpu = old_val & net_hotdata.rps_cpu_mask;
+	u32 new_cpu = new_val & net_hotdata.rps_cpu_mask;
+
+	if (old_val == new_val)
+		return false;
+
+	/*
+	 * RPS LLC Affinity Feature:
+	 * Reduce RFS/ARFS flow updates by checking LLC affinity.
+	 *
+	 * Frequent flow table updates can trigger constant hardware steering
+	 * reconfigurations (e.g., ndo_rx_flow_steer), leading to significant
+	 * contention on driver internal locks (like mlx5's arfs_lock).
+	 *
+	 * This strategy only updates the flow record if it migrates across LLC
+	 * boundaries. This minimizes expensive hardware updates while preserving
+	 * cache locality for the application.
+	 */
+	if (static_branch_unlikely(&rps_feat_llc_affinity)) {
+		/* Slot records a different flow hash: always take it over */
+		if ((old_val ^ new_val) & ~net_hotdata.rps_cpu_mask)
+			return true;
+
+		/* Force update if the recorded CPU is invalid or has gone offline */
+		if (old_cpu >= nr_cpu_ids || !cpu_active(old_cpu))
+			return true;
+
+		/* Force an update if current may no longer run on old_cpu */
+		if (!cpumask_test_cpu(old_cpu, current->cpus_ptr))
+			return true;
+
+		/* Cross-LLC move: update to avoid remote accesses and cache misses */
+		if (!cpus_share_cache(old_cpu, new_cpu))
+			return true;
+
+		return false;
+	}
+
+	return true;
+}
+
 #ifdef CONFIG_RFS_ACCEL
 
 /**
@@ -5249,6 +5303,28 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
 }
 EXPORT_SYMBOL(rps_may_expire_flow);
 
 #endif /* CONFIG_RFS_ACCEL */
 
+/*
+ * Out-of-line, exported wrappers. They must live outside the CONFIG_RFS_ACCEL
+ * block: the header declares them without any CONFIG_RFS_ACCEL guard.
+ */
+void sock_rps_record_flow_hash(__u32 hash)
+{
+#ifdef CONFIG_RPS
+	if (rfs_is_needed())
+		_sock_rps_record_flow_hash(hash);
+#endif
+}
+EXPORT_SYMBOL(sock_rps_record_flow_hash);
+
+void sock_rps_record_flow(const struct sock *sk)
+{
+#ifdef CONFIG_RPS
+	if (rfs_is_needed())
+		_sock_rps_record_flow(sk);
+#endif
+}
+EXPORT_SYMBOL(sock_rps_record_flow);
+
 /* Called from hardirq (IPI) context */
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index b508618bfc12..b6d4ebcbb6a6 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -210,6 +210,33 @@ static int rps_sock_flow_sysctl(const struct ctl_table *table, int write,
 	kvfree_rcu_mightsleep(tofree);
 	return ret;
 }
+
+/* Sysctl handler: read/write rps_feat_llc_affinity as a 0/1 u8 value. */
+static int rps_feat_llc_affinity_sysctl(const struct ctl_table *table, int write,
+					void *buffer, size_t *lenp, loff_t *ppos)
+{
+	u8 enabled = static_branch_unlikely(&rps_feat_llc_affinity) ? 1 : 0;
+	const struct ctl_table shadow = {
+		.data = &enabled,
+		.maxlen = sizeof(enabled),
+		.mode = table->mode,
+		.extra1 = table->extra1,
+		.extra2 = table->extra2,
+	};
+	int err;
+
+	err = proc_dou8vec_minmax(&shadow, write, buffer, lenp, ppos);
+	if (err || !write)
+		return err;
+
+	if (enabled && !static_branch_unlikely(&rps_feat_llc_affinity))
+		static_branch_enable(&rps_feat_llc_affinity);
+	else if (!enabled && static_branch_unlikely(&rps_feat_llc_affinity))
+		static_branch_disable(&rps_feat_llc_affinity);
+
+	return 0;
+}
+
 #endif /* CONFIG_RPS */
 
 #ifdef CONFIG_NET_FLOW_LIMIT
@@ -554,6 +581,14 @@ static struct ctl_table net_core_table[] = { .mode = 0644, .proc_handler = rps_sock_flow_sysctl }, + { + .procname = "rps_feat_llc_affinity", + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = rps_feat_llc_affinity_sysctl, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE + }, #endif #ifdef CONFIG_NET_FLOW_LIMIT {
--
2.47.3