[PATCH v1 net-next 06/14] net: Add per-netns netdev unregistration infra.
From: Kuniyuki Iwashima <kuniyu@google.com>
Date: 2026-07-01 21:43:48
Subsystem: networking drivers, networking [general], the rest
Maintainers: Andrew Lunn, "David S. Miller", Eric Dumazet, Jakub Kicinski, Paolo Abeni, Linus Torvalds
When we need to unregister a netdev in a different netns, we will
delegate its unregistration to per-netns work.
There are three types of such cross-netns devices:
1. Paired devices (e.g., netkit, veth, vxcan)
-> Unregistering one device also deletes its peer, which
may reside in another netns.
2. Tunnel devices (e.g., bareudp, geneve)
-> Destroying a netns removes devices in another netns if
their backend sockets reside in the dying netns.
3. Stacked devices (e.g., ipvlan, macvlan)
-> Removing the lower device also removes multiple upper
devices, each of which may reside in different namespaces.
In these cases, we will use unregister_netdevice_queue_net() to
queue such potential cross-netns devices for destruction.
unregister_netdevice_queue_net() takes net, dev, and head. If dev resides
in net, it simply calls unregister_netdevice_queue(dev, head).
If dev_net(dev) is different from the net, it enqueues the device
to dev_net(dev)->dev_unreg_head and schedules the per-netns work.
When __rtnl_net_unlock() is called from the per-netns work (or another
thread already holding the lock), unregister_netdevice_many_net()
collects the queued devices and calls unregister_netdevice_many()
to perform the actual unregistration.
During netns dismantle, rtnl_net_flush_workqueue() is called at the
end of default_device_exit_batch() to ensure that cross-netns
devices in other live netns are unregistered.
Once RTNL is removed, a device could be moved to another netns while
it is queued on net->dev_unreg_head.
__dev_change_net_namespace() handles this race by acquiring
net->dev_unreg_lock of both the old and new netns after dev_set_net()
and moving the device between their dev_unreg_head lists.
Since dev_set_net() and unregister_netdevice_queue_net() are
synchronised by netdev_lock(), the device is either queued to the
old netns's dev_unreg_head and then moved, or queued directly to
the new netns.
Note that unregister_netdevice_move_net() does not need to call
rtnl_net_queue_work() because __dev_change_net_namespace() is
(supposed to be) called with rtnl_net_lock() held. (Not all callers
hold it yet, but the race cannot happen until all callers
are converted and RTNL is removed.)
Signed-off-by: Kuniyuki Iwashima <kuniyu@google.com>
---
include/linux/netdevice.h | 16 +++++++
include/net/net_namespace.h | 2 +
net/core/dev.c | 85 +++++++++++++++++++++++++++++++++++++
net/core/net_namespace.c | 2 +
net/core/rtnetlink.c | 4 ++
5 files changed, 109 insertions(+)
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 9981d637f8b5..53454db3611a 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h@@ -2241,6 +2241,9 @@ struct net_device { struct list_head dev_list; struct list_head napi_list; struct list_head unreg_list; +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + struct list_head unreg_list_net; +#endif struct list_head close_list; struct list_head ptype_all;
@@ -3472,6 +3475,19 @@ static inline void unregister_netdevice(struct net_device *dev) unregister_netdevice_queue(dev, NULL); } +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void unregister_netdevice_queue_net(struct net *net, struct net_device *dev, + struct list_head *head); +void unregister_netdevice_many_net(struct net *net); +#else +static inline void unregister_netdevice_queue_net(struct net *net, + struct net_device *dev, + struct list_head *head) +{ + unregister_netdevice_queue(dev, head); +} +#endif + int netdev_refcnt_read(const struct net_device *dev); void free_netdev(struct net_device *dev);
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index a989019af5f7..501af1999fe8 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h@@ -198,6 +198,8 @@ struct net { /* Move to a better place when the config guard is removed. */ struct mutex rtnl_mutex; struct work_struct rtnl_work; + struct list_head dev_unreg_head; + spinlock_t dev_unreg_lock; #endif #if IS_ENABLED(CONFIG_VSOCKETS) struct netns_vsock vsock;
diff --git a/net/core/dev.c b/net/core/dev.c
index 48818a194fa5..0f0bf65f5bf9 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c@@ -12092,6 +12092,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, INIT_LIST_HEAD(&dev->napi_list); INIT_LIST_HEAD(&dev->unreg_list); +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + INIT_LIST_HEAD(&dev->unreg_list_net); +#endif INIT_LIST_HEAD(&dev->close_list); INIT_LIST_HEAD(&dev->link_watch_list); INIT_LIST_HEAD(&dev->adj_list.upper);
@@ -12485,6 +12488,16 @@ void unregister_netdevice_many_notify(struct list_head *head, synchronize_net(); list_for_each_entry(dev, head, unreg_list) { +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + struct net *net = dev_net(dev); + + /* spin_lock() can be moved outside of the loop + * once the per-netns RTNL conversion completes. + */ + spin_lock(&net->dev_unreg_lock); + list_del(&dev->unreg_list_net); + spin_unlock(&net->dev_unreg_lock); +#endif netdev_put(dev, &dev->dev_registered_tracker); net_set_todo(dev); cnt++;
@@ -12507,6 +12520,72 @@ void unregister_netdevice_many(struct list_head *head) } EXPORT_SYMBOL(unregister_netdevice_many); +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL +void unregister_netdevice_queue_net(struct net *net, struct net_device *dev, + struct list_head *head) +{ + netdev_lock(dev); + + if (net_eq(dev_net(dev), net)) { + netdev_unlock(dev); + unregister_netdevice_queue(dev, head); + return; + } + + net = dev_net(dev); + + spin_lock(&net->dev_unreg_lock); + + DEBUG_NET_WARN_ON_ONCE(!list_empty(&dev->unreg_list_net)); + list_add_tail(&dev->unreg_list_net, &net->dev_unreg_head); + rtnl_net_queue_work(net); + + spin_unlock(&net->dev_unreg_lock); + + netdev_unlock(dev); +} +EXPORT_SYMBOL(unregister_netdevice_queue_net); + +static void unregister_netdevice_move_net(struct net *net_old, + struct net *net, + struct net_device *dev) +{ + if (net_old > net) { + spin_lock(&net->dev_unreg_lock); + spin_lock(&net_old->dev_unreg_lock); + } else { + spin_lock(&net_old->dev_unreg_lock); + spin_lock(&net->dev_unreg_lock); + } + + if (!list_empty(&dev->unreg_list_net)) { + list_del(&dev->unreg_list_net); + list_add_tail(&dev->unreg_list_net, &net->dev_unreg_head); + } + + spin_unlock(&net_old->dev_unreg_lock); + spin_unlock(&net->dev_unreg_lock); +} + +void unregister_netdevice_many_net(struct net *net) +{ + struct net_device *dev, *tmp; + LIST_HEAD(unreg_head_net); + LIST_HEAD(unreg_head); + + spin_lock(&net->dev_unreg_lock); + list_splice_init(&net->dev_unreg_head, &unreg_head_net); + spin_unlock(&net->dev_unreg_lock); + + list_for_each_entry_safe(dev, tmp, &unreg_head_net, unreg_list_net) { + list_del_init(&dev->unreg_list_net); + list_add_tail(&dev->unreg_list, &unreg_head); + } + + unregister_netdevice_many(&unreg_head); +} +#endif + /** * unregister_netdev - remove device from the kernel * @dev: device
@@ -12663,6 +12742,10 @@ int __dev_change_net_namespace(struct net_device *dev, struct net *net, netdev_unlock(dev); dev->ifindex = new_ifindex; +#ifdef CONFIG_DEBUG_NET_SMALL_RTNL + unregister_netdevice_move_net(net_old, net, dev); +#endif + if (new_name[0]) { /* Rename the netdev to prepared name */ write_seqlock_bh(&netdev_rename_lock);
@@ -13105,6 +13188,8 @@ static void __net_exit default_device_exit_batch(struct list_head *net_list) } unregister_netdevice_many(&dev_kill_list); rtnl_unlock(); + + rtnl_net_flush_workqueue(); } static struct pernet_operations __net_initdata default_device_ops = {
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index d1aeff9de580..578b48cf5318 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c@@ -423,6 +423,8 @@ static __net_init int preinit_net(struct net *net, struct user_namespace *user_n mutex_init(&net->rtnl_mutex); lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL); INIT_WORK(&net->rtnl_work, rtnl_net_work_func); + INIT_LIST_HEAD(&net->dev_unreg_head); + spin_lock_init(&net->dev_unreg_lock); #endif INIT_LIST_HEAD(&net->ptype_all);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 7959519e7375..544498d3c325 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c@@ -197,6 +197,7 @@ void __rtnl_net_unlock(struct net *net) { ASSERT_RTNL(); + unregister_netdevice_many_net(net); mutex_unlock(&net->rtnl_mutex); } EXPORT_SYMBOL(__rtnl_net_unlock);
@@ -290,6 +291,9 @@ void rtnl_net_work_func(struct work_struct *work) { struct net *net = container_of(work, struct net, rtnl_work); + if (list_empty(&net->dev_unreg_head)) + return; + rtnl_net_lock(net); rtnl_net_unlock(net); }
--
2.55.0.rc0.799.gd6f94ed593-goog