Merge branch 'rtnetlink-convert-rtnl_newlink-to-per-netns-rtnl'

Kuniyuki Iwashima says:

====================
rtnetlink: Convert rtnl_newlink() to per-netns RTNL.

Patches 1 - 3 remove __rtnl_link_unregister and protect link_ops with
its dedicated mutex to move synchronize_srcu() out of the RTNL scope.

Patch 4 introduces struct rtnl_nets and helper functions to acquire
multiple per-netns RTNL in rtnl_newlink().

Patches 5 - 8 prefetch the peer device's netns in rtnl_newlink().

Patch 9 converts rtnl_newlink() to per-netns RTNL.

Patch 10 pushes RTNL down to rtnl_dellink() and rtnl_setlink(), but
the conversion will not be completed unless we support cases with
peer/upper/lower devices.

I confirmed v3 survived ./rtnetlink.sh; rmmod netdevsim.ko; without a
lockdep splat.

v3: https://lore.kernel.org/20241107022900.70287-1-kuniyu@amazon.com
v2: https://lore.kernel.org/20241106022432.13065-1-kuniyu@amazon.com
v1: https://lore.kernel.org/20241105020514.41963-1-kuniyu@amazon.com
====================

Link: https://patch.msgid.link/20241108004823.29419-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
Jakub Kicinski
2024-11-11 17:26:54 -08:00
8 changed files with 218 additions and 125 deletions
+3 -9
View File
@@ -188,14 +188,10 @@ static int vxcan_newlink(struct net *net, struct net_device *dev,
/* register peer device */
if (data && data[VXCAN_INFO_PEER]) {
struct nlattr *nla_peer;
struct nlattr *nla_peer = data[VXCAN_INFO_PEER];
nla_peer = data[VXCAN_INFO_PEER];
ifmp = nla_data(nla_peer);
err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
if (err < 0)
return err;
rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
tbp = peer_tb;
}
@@ -208,9 +204,6 @@ static int vxcan_newlink(struct net *net, struct net_device *dev,
}
peer_net = rtnl_link_get_net(net, tbp);
if (IS_ERR(peer_net))
return PTR_ERR(peer_net);
peer = rtnl_create_link(peer_net, ifname, name_assign_type,
&vxcan_link_ops, tbp, extack);
if (IS_ERR(peer)) {
@@ -302,6 +295,7 @@ static struct rtnl_link_ops vxcan_link_ops = {
.newlink = vxcan_newlink,
.dellink = vxcan_dellink,
.policy = vxcan_policy,
.peer_type = VXCAN_INFO_PEER,
.maxtype = VXCAN_INFO_MAX,
.get_link_net = vxcan_get_link_net,
};
+8 -9
View File
@@ -168,22 +168,21 @@ static int __init dummy_init_module(void)
{
int i, err = 0;
down_write(&pernet_ops_rwsem);
rtnl_lock();
err = __rtnl_link_register(&dummy_link_ops);
err = rtnl_link_register(&dummy_link_ops);
if (err < 0)
goto out;
return err;
rtnl_net_lock(&init_net);
for (i = 0; i < numdummies && !err; i++) {
err = dummy_init_one();
cond_resched();
}
if (err < 0)
__rtnl_link_unregister(&dummy_link_ops);
out:
rtnl_unlock();
up_write(&pernet_ops_rwsem);
rtnl_net_unlock(&init_net);
if (err < 0)
rtnl_link_unregister(&dummy_link_ops);
return err;
}
+8 -9
View File
@@ -426,22 +426,21 @@ static int __init ifb_init_module(void)
{
int i, err;
down_write(&pernet_ops_rwsem);
rtnl_lock();
err = __rtnl_link_register(&ifb_link_ops);
err = rtnl_link_register(&ifb_link_ops);
if (err < 0)
goto out;
return err;
rtnl_net_lock(&init_net);
for (i = 0; i < numifbs && !err; i++) {
err = ifb_init_one(i);
cond_resched();
}
if (err)
__rtnl_link_unregister(&ifb_link_ops);
out:
rtnl_unlock();
up_write(&pernet_ops_rwsem);
rtnl_net_unlock(&init_net);
if (err)
rtnl_link_unregister(&ifb_link_ops);
return err;
}
+2 -9
View File
@@ -351,12 +351,7 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
if (data[IFLA_NETKIT_PEER_INFO]) {
attr = data[IFLA_NETKIT_PEER_INFO];
ifmp = nla_data(attr);
err = rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack);
if (err < 0)
return err;
err = netkit_validate(peer_tb, NULL, extack);
if (err < 0)
return err;
rtnl_nla_parse_ifinfomsg(peer_tb, attr, extack);
tbp = peer_tb;
}
if (data[IFLA_NETKIT_SCRUB])
@@ -391,9 +386,6 @@ static int netkit_new_link(struct net *src_net, struct net_device *dev,
return -EOPNOTSUPP;
net = rtnl_link_get_net(src_net, tbp);
if (IS_ERR(net))
return PTR_ERR(net);
peer = rtnl_create_link(net, ifname, ifname_assign_type,
&netkit_link_ops, tbp, extack);
if (IS_ERR(peer)) {
@@ -978,6 +970,7 @@ static struct rtnl_link_ops netkit_link_ops = {
.fill_info = netkit_fill_info,
.policy = netkit_policy,
.validate = netkit_validate,
.peer_type = IFLA_NETKIT_PEER_INFO,
.maxtype = IFLA_NETKIT_MAX,
};
+4 -14
View File
@@ -1781,19 +1781,11 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
/*
* create and register peer first
*/
if (data != NULL && data[VETH_INFO_PEER] != NULL) {
struct nlattr *nla_peer;
if (data && data[VETH_INFO_PEER]) {
struct nlattr *nla_peer = data[VETH_INFO_PEER];
nla_peer = data[VETH_INFO_PEER];
ifmp = nla_data(nla_peer);
err = rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
if (err < 0)
return err;
err = veth_validate(peer_tb, NULL, extack);
if (err < 0)
return err;
rtnl_nla_parse_ifinfomsg(peer_tb, nla_peer, extack);
tbp = peer_tb;
} else {
ifmp = NULL;
@@ -1809,9 +1801,6 @@ static int veth_newlink(struct net *src_net, struct net_device *dev,
}
net = rtnl_link_get_net(src_net, tbp);
if (IS_ERR(net))
return PTR_ERR(net);
peer = rtnl_create_link(net, ifname, name_assign_type,
&veth_link_ops, tbp, extack);
if (IS_ERR(peer)) {
@@ -1952,6 +1941,7 @@ static struct rtnl_link_ops veth_link_ops = {
.newlink = veth_newlink,
.dellink = veth_dellink,
.policy = veth_policy,
.peer_type = VETH_INFO_PEER,
.maxtype = VETH_INFO_MAX,
.get_link_net = veth_get_link_net,
.get_num_tx_queues = veth_get_num_queues,
+4 -4
View File
@@ -13,6 +13,7 @@ typedef int (*rtnl_dumpit_func)(struct sk_buff *, struct netlink_callback *);
enum rtnl_link_flags {
RTNL_FLAG_DOIT_UNLOCKED = BIT(0),
#define RTNL_FLAG_DOIT_PERNET RTNL_FLAG_DOIT_UNLOCKED
#define RTNL_FLAG_DOIT_PERNET_WIP RTNL_FLAG_DOIT_UNLOCKED
RTNL_FLAG_BULK_DEL_SUPPORTED = BIT(1),
RTNL_FLAG_DUMP_UNLOCKED = BIT(2),
RTNL_FLAG_DUMP_SPLIT_NLM_DONE = BIT(3), /* legacy behavior */
@@ -71,10 +72,11 @@ static inline int rtnl_msg_family(const struct nlmsghdr *nlh)
/**
* struct rtnl_link_ops - rtnetlink link operations
*
* @list: Used internally, protected by RTNL and SRCU
* @list: Used internally, protected by link_ops_mutex and SRCU
* @srcu: Used internally
* @kind: Identifier
* @netns_refund: Physical device, move to init_net on netns exit
* @peer_type: Peer device specific netlink attribute number (e.g. VETH_INFO_PEER)
* @maxtype: Highest device specific netlink attribute number
* @policy: Netlink policy for device specific attribute validation
* @validate: Optional validation function for netlink/changelink parameters
@@ -116,6 +118,7 @@ struct rtnl_link_ops {
void (*setup)(struct net_device *dev);
bool netns_refund;
const u16 peer_type;
unsigned int maxtype;
const struct nla_policy *policy;
int (*validate)(struct nlattr *tb[],
@@ -164,9 +167,6 @@ struct rtnl_link_ops {
int *prividx, int attr);
};
int __rtnl_link_register(struct rtnl_link_ops *ops);
void __rtnl_link_unregister(struct rtnl_link_ops *ops);
int rtnl_link_register(struct rtnl_link_ops *ops);
void rtnl_link_unregister(struct rtnl_link_ops *ops);
-1
View File
@@ -56,7 +56,6 @@ static bool init_net_initialized;
* outside.
*/
DECLARE_RWSEM(pernet_ops_rwsem);
EXPORT_SYMBOL_GPL(pernet_ops_rwsem);
#define MIN_PERNET_OPS_ID \
((sizeof(struct net_generic) + sizeof(void *) - 1) / sizeof(void *))
+189 -70
View File
@@ -258,8 +258,87 @@ bool lockdep_rtnl_net_is_held(struct net *net)
return lockdep_rtnl_is_held() && lockdep_is_held(&net->rtnl_mutex);
}
EXPORT_SYMBOL(lockdep_rtnl_net_is_held);
#else
static int rtnl_net_cmp_locks(const struct net *net_a, const struct net *net_b)
{
/* No need to swap */
return -1;
}
#endif
struct rtnl_nets {
/* ->newlink() needs to freeze 3 netns at most;
* 2 for the new device, 1 for its peer.
*/
struct net *net[3];
unsigned char len;
};
static void rtnl_nets_init(struct rtnl_nets *rtnl_nets)
{
memset(rtnl_nets, 0, sizeof(*rtnl_nets));
}
static void rtnl_nets_destroy(struct rtnl_nets *rtnl_nets)
{
int i;
for (i = 0; i < rtnl_nets->len; i++) {
put_net(rtnl_nets->net[i]);
rtnl_nets->net[i] = NULL;
}
rtnl_nets->len = 0;
}
/**
* rtnl_nets_add - Add netns to be locked before ->newlink().
*
* @rtnl_nets: rtnl_nets pointer passed to ->get_peer_net().
* @net: netns pointer with an extra refcnt held.
*
* The extra refcnt is released in rtnl_nets_destroy().
*/
static void rtnl_nets_add(struct rtnl_nets *rtnl_nets, struct net *net)
{
int i;
DEBUG_NET_WARN_ON_ONCE(rtnl_nets->len == ARRAY_SIZE(rtnl_nets->net));
for (i = 0; i < rtnl_nets->len; i++) {
switch (rtnl_net_cmp_locks(rtnl_nets->net[i], net)) {
case 0:
put_net(net);
return;
case 1:
swap(rtnl_nets->net[i], net);
}
}
rtnl_nets->net[i] = net;
rtnl_nets->len++;
}
static void rtnl_nets_lock(struct rtnl_nets *rtnl_nets)
{
int i;
rtnl_lock();
for (i = 0; i < rtnl_nets->len; i++)
__rtnl_net_lock(rtnl_nets->net[i]);
}
static void rtnl_nets_unlock(struct rtnl_nets *rtnl_nets)
{
int i;
for (i = 0; i < rtnl_nets->len; i++)
__rtnl_net_unlock(rtnl_nets->net[i]);
rtnl_unlock();
}
static struct rtnl_link __rcu *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1];
static inline int rtm_msgindex(int msgtype)
@@ -466,6 +545,7 @@ void __rtnl_unregister_many(const struct rtnl_msg_handler *handlers, int n)
}
EXPORT_SYMBOL_GPL(__rtnl_unregister_many);
static DEFINE_MUTEX(link_ops_mutex);
static LIST_HEAD(link_ops);
static struct rtnl_link_ops *rtnl_link_ops_get(const char *kind, int *srcu_index)
@@ -494,27 +574,20 @@ static void rtnl_link_ops_put(struct rtnl_link_ops *ops, int srcu_index)
}
/**
* __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
* rtnl_link_register - Register rtnl_link_ops with rtnetlink.
* @ops: struct rtnl_link_ops * to register
*
* The caller must hold the rtnl_mutex. This function should be used
* by drivers that create devices during module initialization. It
* must be called before registering the devices.
*
* Returns 0 on success or a negative error code.
*/
int __rtnl_link_register(struct rtnl_link_ops *ops)
int rtnl_link_register(struct rtnl_link_ops *ops)
{
struct rtnl_link_ops *tmp;
int err;
/* When RTNL is removed, add lock for link_ops. */
ASSERT_RTNL();
list_for_each_entry(tmp, &link_ops, list) {
if (!strcmp(ops->kind, tmp->kind))
return -EEXIST;
}
/* Sanity-check max sizes to avoid stack buffer overflow. */
if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
return -EINVAL;
/* The check for alloc/setup is here because if ops
* does not have that filled up, it is not possible
@@ -528,30 +601,19 @@ int __rtnl_link_register(struct rtnl_link_ops *ops)
if (err)
return err;
mutex_lock(&link_ops_mutex);
list_for_each_entry(tmp, &link_ops, list) {
if (!strcmp(ops->kind, tmp->kind)) {
err = -EEXIST;
goto unlock;
}
}
list_add_tail_rcu(&ops->list, &link_ops);
unlock:
mutex_unlock(&link_ops_mutex);
return 0;
}
EXPORT_SYMBOL_GPL(__rtnl_link_register);
/**
* rtnl_link_register - Register rtnl_link_ops with rtnetlink.
* @ops: struct rtnl_link_ops * to register
*
* Returns 0 on success or a negative error code.
*/
int rtnl_link_register(struct rtnl_link_ops *ops)
{
int err;
/* Sanity-check max sizes to avoid stack buffer overflow. */
if (WARN_ON(ops->maxtype > RTNL_MAX_TYPE ||
ops->slave_maxtype > RTNL_SLAVE_MAX_TYPE))
return -EINVAL;
rtnl_lock();
err = __rtnl_link_register(ops);
rtnl_unlock();
return err;
}
EXPORT_SYMBOL_GPL(rtnl_link_register);
@@ -568,27 +630,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops)
unregister_netdevice_many(&list_kill);
}
/**
* __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink.
* @ops: struct rtnl_link_ops * to unregister
*
* The caller must hold the rtnl_mutex and guarantee net_namespace_list
* integrity (hold pernet_ops_rwsem for writing to close the race
* with setup_net() and cleanup_net()).
*/
void __rtnl_link_unregister(struct rtnl_link_ops *ops)
{
struct net *net;
list_del_rcu(&ops->list);
synchronize_srcu(&ops->srcu);
cleanup_srcu_struct(&ops->srcu);
for_each_net(net)
__rtnl_kill_links(net, ops);
}
EXPORT_SYMBOL_GPL(__rtnl_link_unregister);
/* Return with the rtnl_lock held when there are no network
* devices unregistering in any network namespace.
*/
@@ -617,10 +658,22 @@ static void rtnl_lock_unregistering_all(void)
*/
void rtnl_link_unregister(struct rtnl_link_ops *ops)
{
struct net *net;
mutex_lock(&link_ops_mutex);
list_del_rcu(&ops->list);
mutex_unlock(&link_ops_mutex);
synchronize_srcu(&ops->srcu);
cleanup_srcu_struct(&ops->srcu);
/* Close the race with setup_net() and cleanup_net() */
down_write(&pernet_ops_rwsem);
rtnl_lock_unregistering_all();
__rtnl_link_unregister(ops);
for_each_net(net)
__rtnl_kill_links(net, ops);
rtnl_unlock();
up_write(&pernet_ops_rwsem);
}
@@ -2459,9 +2512,10 @@ int rtnl_nla_parse_ifinfomsg(struct nlattr **tb, const struct nlattr *nla_peer,
}
EXPORT_SYMBOL(rtnl_nla_parse_ifinfomsg);
struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
static struct net *rtnl_link_get_net_ifla(struct nlattr *tb[])
{
struct net *net;
struct net *net = NULL;
/* Examine the link attributes and figure out which
* network namespace we are talking about.
*/
@@ -2469,8 +2523,17 @@ struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
net = get_net_ns_by_pid(nla_get_u32(tb[IFLA_NET_NS_PID]));
else if (tb[IFLA_NET_NS_FD])
net = get_net_ns_by_fd(nla_get_u32(tb[IFLA_NET_NS_FD]));
else
return net;
}
struct net *rtnl_link_get_net(struct net *src_net, struct nlattr *tb[])
{
struct net *net = rtnl_link_get_net_ifla(tb);
if (!net)
net = get_net(src_net);
return net;
}
EXPORT_SYMBOL(rtnl_link_get_net);
@@ -3316,6 +3379,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *net = sock_net(skb->sk);
struct nlattr *tb[IFLA_MAX+1];
struct net_device *dev = NULL;
struct rtnl_nets rtnl_nets;
struct net *tgt_net;
int err;
@@ -3334,6 +3398,12 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto errout;
}
rtnl_nets_init(&rtnl_nets);
rtnl_nets_add(&rtnl_nets, get_net(net));
rtnl_nets_add(&rtnl_nets, tgt_net);
rtnl_nets_lock(&rtnl_nets);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(net, ifm->ifi_index);
else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
@@ -3346,7 +3416,7 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
else if (!err)
err = -ENODEV;
put_net(tgt_net);
rtnl_nets_unlock(&rtnl_nets);
errout:
return err;
}
@@ -3431,6 +3501,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
return PTR_ERR(tgt_net);
}
rtnl_net_lock(tgt_net);
if (ifm->ifi_index > 0)
dev = __dev_get_by_index(tgt_net, ifm->ifi_index);
else if (tb[IFLA_IFNAME] || tb[IFLA_ALT_IFNAME])
@@ -3445,6 +3517,8 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh,
else
err = -EINVAL;
rtnl_net_unlock(tgt_net);
if (netnsid >= 0)
put_net(tgt_net);
@@ -3732,6 +3806,37 @@ out_unregister:
goto out;
}
static int rtnl_add_peer_net(struct rtnl_nets *rtnl_nets,
const struct rtnl_link_ops *ops,
struct nlattr *data[],
struct netlink_ext_ack *extack)
{
struct nlattr *tb[IFLA_MAX + 1];
struct net *net;
int err;
if (!data || !data[ops->peer_type])
return 0;
err = rtnl_nla_parse_ifinfomsg(tb, data[ops->peer_type], extack);
if (err < 0)
return err;
if (ops->validate) {
err = ops->validate(tb, NULL, extack);
if (err < 0)
return err;
}
net = rtnl_link_get_net_ifla(tb);
if (IS_ERR(net))
return PTR_ERR(net);
if (net)
rtnl_nets_add(rtnl_nets, net);
return 0;
}
static int __rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
const struct rtnl_link_ops *ops,
struct net *tgt_net, struct net *link_net,
@@ -3793,6 +3898,7 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
struct net *tgt_net, *link_net = NULL;
struct rtnl_link_ops *ops = NULL;
struct rtnl_newlink_tbs *tbs;
struct rtnl_nets rtnl_nets;
int ops_srcu_index;
int ret;
@@ -3828,14 +3934,14 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
ops = rtnl_link_ops_get(kind, &ops_srcu_index);
#ifdef CONFIG_MODULES
if (!ops) {
__rtnl_unlock();
request_module("rtnl-link-%s", kind);
rtnl_lock();
ops = rtnl_link_ops_get(kind, &ops_srcu_index);
}
#endif
}
rtnl_nets_init(&rtnl_nets);
if (ops) {
if (ops->maxtype > RTNL_MAX_TYPE) {
ret = -EINVAL;
@@ -3857,14 +3963,22 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
if (ret < 0)
goto put_ops;
}
if (ops->peer_type) {
ret = rtnl_add_peer_net(&rtnl_nets, ops, data, extack);
if (ret < 0)
goto put_ops;
}
}
tgt_net = rtnl_link_get_net_capable(skb, sock_net(skb->sk), tb, CAP_NET_ADMIN);
if (IS_ERR(tgt_net)) {
ret = PTR_ERR(tgt_net);
goto put_ops;
goto put_net;
}
rtnl_nets_add(&rtnl_nets, tgt_net);
if (tb[IFLA_LINK_NETNSID]) {
int id = nla_get_s32(tb[IFLA_LINK_NETNSID]);
@@ -3875,18 +3989,20 @@ static int rtnl_newlink(struct sk_buff *skb, struct nlmsghdr *nlh,
goto put_net;
}
rtnl_nets_add(&rtnl_nets, link_net);
if (!netlink_ns_capable(skb, link_net->user_ns, CAP_NET_ADMIN)) {
ret = -EPERM;
goto put_net;
}
}
rtnl_nets_lock(&rtnl_nets);
ret = __rtnl_newlink(skb, nlh, ops, tgt_net, link_net, tbs, data, extack);
rtnl_nets_unlock(&rtnl_nets);
put_net:
if (link_net)
put_net(link_net);
put_net(tgt_net);
rtnl_nets_destroy(&rtnl_nets);
put_ops:
if (ops)
rtnl_link_ops_put(ops, ops_srcu_index);
@@ -6887,11 +7003,14 @@ static struct pernet_operations rtnetlink_net_ops = {
};
static const struct rtnl_msg_handler rtnetlink_rtnl_msg_handlers[] __initconst = {
{.msgtype = RTM_NEWLINK, .doit = rtnl_newlink},
{.msgtype = RTM_DELLINK, .doit = rtnl_dellink},
{.msgtype = RTM_NEWLINK, .doit = rtnl_newlink,
.flags = RTNL_FLAG_DOIT_PERNET},
{.msgtype = RTM_DELLINK, .doit = rtnl_dellink,
.flags = RTNL_FLAG_DOIT_PERNET_WIP},
{.msgtype = RTM_GETLINK, .doit = rtnl_getlink,
.dumpit = rtnl_dump_ifinfo, .flags = RTNL_FLAG_DUMP_SPLIT_NLM_DONE},
{.msgtype = RTM_SETLINK, .doit = rtnl_setlink},
{.msgtype = RTM_SETLINK, .doit = rtnl_setlink,
.flags = RTNL_FLAG_DOIT_PERNET_WIP},
{.msgtype = RTM_GETADDR, .dumpit = rtnl_dump_all},
{.msgtype = RTM_GETROUTE, .dumpit = rtnl_dump_all},
{.msgtype = RTM_GETNETCONF, .dumpit = rtnl_dump_all},