From f1a8d107d91db7923518abd987ddcb3cd6ea6af4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 May 2025 19:27:17 -0700 Subject: [PATCH 1/7] ipv6: Remove rcu_read_lock() in fib6_get_table(). Once allocated, the IPv6 routing table is not freed until netns is dismantled. fib6_get_table() uses rcu_read_lock() while iterating net->ipv6.fib_table_hash[], but it's not needed and rather confusing. Because some callers have this pattern, table = fib6_get_table(); rcu_read_lock(); /* ... use table here ... */ rcu_read_unlock(); [ See: addrconf_get_prefix_route(), ip6_route_del(), rt6_get_route_info(), rt6_get_dflt_router() ] and this looks illegal but is actually safe. Let's remove rcu_read_lock() in fib6_get_table() and pass true to the last argument of hlist_for_each_entry_rcu() to bypass the RCU check. Note that protection is not needed but RCU helper is used to avoid data-race. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250516022759.44392-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 1f860340690c..88770ecd2da1 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -281,22 +281,20 @@ EXPORT_SYMBOL_GPL(fib6_new_table); struct fib6_table *fib6_get_table(struct net *net, u32 id) { - struct fib6_table *tb; struct hlist_head *head; - unsigned int h; + struct fib6_table *tb; - if (id == 0) + if (!id) id = RT6_TABLE_MAIN; - h = id & (FIB6_TABLE_HASHSZ - 1); - rcu_read_lock(); - head = &net->ipv6.fib_table_hash[h]; - hlist_for_each_entry_rcu(tb, head, tb6_hlist) { - if (tb->tb6_id == id) { - rcu_read_unlock(); + + head = &net->ipv6.fib_table_hash[id & (FIB6_TABLE_HASHSZ - 1)]; + + /* See comment in fib6_link_table(). RCU is not required, + * but rcu_dereference_raw() is used to avoid data-race. 
+ */ + hlist_for_each_entry_rcu(tb, head, tb6_hlist, true) + if (tb->tb6_id == id) return tb; - } - } - rcu_read_unlock(); return NULL; } From f0a56c17e64bb5e7cdb9295df2b5fc21e4949005 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 May 2025 19:27:18 -0700 Subject: [PATCH 2/7] inet: Remove rtnl_is_held arg of lwtunnel_valid_encap_type(_attr)?(). Commit f130a0cc1b4f ("inet: fix lwtunnel_valid_encap_type() lock imbalance") added the rtnl_is_held argument as a temporary fix while I'm converting nexthop and IPv6 routing table to per-netns RTNL or RCU. Now all callers of lwtunnel_valid_encap_type() do not hold RTNL. Let's remove the argument. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250516022759.44392-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- include/net/lwtunnel.h | 13 +++++-------- net/core/lwtunnel.c | 15 +++------------ net/ipv4/fib_frontend.c | 4 ++-- net/ipv4/nexthop.c | 3 +-- net/ipv6/route.c | 6 ++---- 5 files changed, 13 insertions(+), 28 deletions(-) diff --git a/include/net/lwtunnel.h b/include/net/lwtunnel.h index 39cd50300a18..c306ebe379a0 100644 --- a/include/net/lwtunnel.h +++ b/include/net/lwtunnel.h @@ -116,11 +116,9 @@ int lwtunnel_encap_add_ops(const struct lwtunnel_encap_ops *op, int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, unsigned int num); int lwtunnel_valid_encap_type(u16 encap_type, - struct netlink_ext_ack *extack, - bool rtnl_is_held); + struct netlink_ext_ack *extack); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, - struct netlink_ext_ack *extack, - bool rtnl_is_held); + struct netlink_ext_ack *extack); int lwtunnel_build_state(struct net *net, u16 encap_type, struct nlattr *encap, unsigned int family, const void *cfg, @@ -203,15 +201,14 @@ static inline int lwtunnel_encap_del_ops(const struct lwtunnel_encap_ops *op, } static inline int lwtunnel_valid_encap_type(u16 encap_type, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct 
netlink_ext_ack *extack) { NL_SET_ERR_MSG(extack, "CONFIG_LWTUNNEL is not enabled in this kernel"); return -EOPNOTSUPP; } + static inline int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int len, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct netlink_ext_ack *extack) { /* return 0 since we are not walking attr looking for * RTA_ENCAP_TYPE attribute on nexthops. diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 60f27cb4e54f..f9d76d85d04f 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -149,8 +149,7 @@ int lwtunnel_build_state(struct net *net, u16 encap_type, } EXPORT_SYMBOL_GPL(lwtunnel_build_state); -int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, - bool rtnl_is_held) +int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack) { const struct lwtunnel_encap_ops *ops; int ret = -EINVAL; @@ -167,12 +166,7 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, const char *encap_type_str = lwtunnel_encap_str(encap_type); if (encap_type_str) { - if (rtnl_is_held) - __rtnl_unlock(); request_module("rtnl-lwt-%s", encap_type_str); - if (rtnl_is_held) - rtnl_lock(); - ops = rcu_access_pointer(lwtun_encaps[encap_type]); } } @@ -186,8 +180,7 @@ int lwtunnel_valid_encap_type(u16 encap_type, struct netlink_ext_ack *extack, EXPORT_SYMBOL_GPL(lwtunnel_valid_encap_type); int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, - struct netlink_ext_ack *extack, - bool rtnl_is_held) + struct netlink_ext_ack *extack) { struct rtnexthop *rtnh = (struct rtnexthop *)attr; struct nlattr *nla_entype; @@ -208,9 +201,7 @@ int lwtunnel_valid_encap_type_attr(struct nlattr *attr, int remaining, } encap_type = nla_get_u16(nla_entype); - if (lwtunnel_valid_encap_type(encap_type, - extack, - rtnl_is_held) != 0) + if (lwtunnel_valid_encap_type(encap_type, extack)) return -EOPNOTSUPP; } } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 
57f088e5540e..fd1e1507a224 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -807,7 +807,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_MULTIPATH: err = lwtunnel_valid_encap_type_attr(nla_data(attr), nla_len(attr), - extack, false); + extack); if (err < 0) goto errout; cfg->fc_mp = nla_data(attr); @@ -825,7 +825,7 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, case RTA_ENCAP_TYPE: cfg->fc_encap_type = nla_get_u16(attr); err = lwtunnel_valid_encap_type(cfg->fc_encap_type, - extack, false); + extack); if (err < 0) goto errout; break; diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 823e4a783d2b..4397e89d3123 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -3180,8 +3180,7 @@ static int rtm_to_nh_config(struct net *net, struct sk_buff *skb, } cfg->nh_encap_type = nla_get_u16(tb[NHA_ENCAP_TYPE]); - err = lwtunnel_valid_encap_type(cfg->nh_encap_type, - extack, false); + err = lwtunnel_valid_encap_type(cfg->nh_encap_type, extack); if (err < 0) goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 44300962230b..6baf177c529b 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5172,8 +5172,7 @@ static int rtm_to_fib6_multipath_config(struct fib6_config *cfg, rtnh = rtnh_next(rtnh, &remaining); } while (rtnh_ok(rtnh, remaining)); - return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, - extack, false); + return lwtunnel_valid_encap_type_attr(cfg->fc_mp, cfg->fc_mp_len, extack); } static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, @@ -5310,8 +5309,7 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (tb[RTA_ENCAP_TYPE]) { cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]); - err = lwtunnel_valid_encap_type(cfg->fc_encap_type, - extack, false); + err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack); if (err < 0) goto errout; } From 8e5f1bb812741821e2a8ac221fba45cab6c73e43 Mon Sep 17 00:00:00 
2001 From: Kuniyuki Iwashima Date: Thu, 15 May 2025 19:27:19 -0700 Subject: [PATCH 3/7] ipv6: Narrow down RCU critical section in inet6_rtm_newroute(). Commit 169fd62799e8 ("ipv6: Get rid of RTNL for SIOCADDRT and RTM_NEWROUTE.") added rcu_read_lock() covering ip6_route_info_create_nh() and __ip6_ins_rt() to guarantee that nexthop and netdev will not go away. However, as reported by syzkaller [0], ip_tun_build_state() calls dst_cache_init() with GFP_KERNEL during the RCU critical section. ip6_route_info_create_nh() fetches nexthop or netdev depending on whether RTA_NH_ID is set, and struct fib6_info holds a refcount on whichever one it uses, taken via nexthop_get() or netdev_get_by_index(). netdev_get_by_index() looks up a dev and calls dev_hold() under RCU. So, we need RCU only around nexthop_find_by_id() and nexthop_get() (and a little more nexthop code). Let's add rcu_read_lock() there and remove rcu_read_lock() in ip6_route_add() and ip6_route_multipath_add(). As a result, the following functions called from fib6_add() now need RCU: - inet6_rt_notify() - fib6_drop_pcpu_from() (via fib6_purge_rt()) - rt6_flush_exceptions() (via fib6_purge_rt()) - ip6_ignore_linkdown() (via rt6_multipath_rebalance()) All callers of inet6_rt_notify() need RCU, so rcu_read_lock() is added there.
[0]: [ BUG: Invalid wait context ] 6.15.0-rc4-syzkaller-00746-g836b313a14a3 #0 Tainted: G W ---------------------------- syz-executor234/5832 is trying to lock: ffffffff8e021688 (pcpu_alloc_mutex){+.+.}-{4:4}, at: pcpu_alloc_noprof+0x284/0x16b0 mm/percpu.c:1782 other info that might help us debug this: context-{5:5} 1 lock held by syz-executor234/5832: 0: ffffffff8df3b860 (rcu_read_lock){....}-{1:3}, at: rcu_lock_acquire include/linux/rcupdate.h:331 [inline] 0: ffffffff8df3b860 (rcu_read_lock){....}-{1:3}, at: rcu_read_lock include/linux/rcupdate.h:841 [inline] 0: ffffffff8df3b860 (rcu_read_lock){....}-{1:3}, at: ip6_route_add+0x4d/0x2f0 net/ipv6/route.c:3913 stack backtrace: CPU: 0 UID: 0 PID: 5832 Comm: syz-executor234 Tainted: G W 6.15.0-rc4-syzkaller-00746-g836b313a14a3 #0 PREEMPT(full) Tainted: [W]=WARN Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 04/29/2025 Call Trace: dump_stack_lvl+0x189/0x250 lib/dump_stack.c:120 print_lock_invalid_wait_context kernel/locking/lockdep.c:4831 [inline] check_wait_context kernel/locking/lockdep.c:4903 [inline] __lock_acquire+0xbcf/0xd20 kernel/locking/lockdep.c:5185 lock_acquire+0x120/0x360 kernel/locking/lockdep.c:5866 __mutex_lock_common kernel/locking/mutex.c:601 [inline] __mutex_lock+0x182/0xe80 kernel/locking/mutex.c:746 pcpu_alloc_noprof+0x284/0x16b0 mm/percpu.c:1782 dst_cache_init+0x37/0xc0 net/core/dst_cache.c:145 ip_tun_build_state+0x193/0x6b0 net/ipv4/ip_tunnel_core.c:687 lwtunnel_build_state+0x381/0x4c0 net/core/lwtunnel.c:137 fib_nh_common_init+0x129/0x460 net/ipv4/fib_semantics.c:635 fib6_nh_init+0x15e4/0x2030 net/ipv6/route.c:3669 ip6_route_info_create_nh+0x139/0x870 net/ipv6/route.c:3866 ip6_route_add+0xf6/0x2f0 net/ipv6/route.c:3915 inet6_rtm_newroute+0x284/0x1c50 net/ipv6/route.c:5732 rtnetlink_rcv_msg+0x7cc/0xb70 net/core/rtnetlink.c:6955 netlink_rcv_skb+0x219/0x490 net/netlink/af_netlink.c:2534 netlink_unicast_kernel net/netlink/af_netlink.c:1313 [inline] 
netlink_unicast+0x758/0x8d0 net/netlink/af_netlink.c:1339 netlink_sendmsg+0x805/0xb30 net/netlink/af_netlink.c:1883 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg+0x219/0x270 net/socket.c:727 ____sys_sendmsg+0x505/0x830 net/socket.c:2566 ___sys_sendmsg+0x21f/0x2a0 net/socket.c:2620 __sys_sendmsg net/socket.c:2652 [inline] __do_sys_sendmsg net/socket.c:2657 [inline] __se_sys_sendmsg net/socket.c:2655 [inline] __x64_sys_sendmsg+0x19b/0x260 net/socket.c:2655 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] do_syscall_64+0xf6/0x210 arch/x86/entry/syscall_64.c:94 Fixes: 169fd62799e8 ("ipv6: Get rid of RTNL for SIOCADDRT and RTM_NEWROUTE.") Reported-by: syzbot+bcc12d6799364500fbec@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=bcc12d6799364500fbec Reported-by: Eric Dumazet Closes: https://lore.kernel.org/netdev/CANn89i+r1cGacVC_6n3-A-WSkAa_Nr+pmxJ7Gt+oP-P9by2aGw@mail.gmail.com/ Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250516022759.44392-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv6/ip6_fib.c | 9 +++++++-- net/ipv6/route.c | 31 ++++++++++++++++++------------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 88770ecd2da1..7094d7708686 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1027,8 +1027,9 @@ static void fib6_drop_pcpu_from(struct fib6_info *f6i, .table = table }; - nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, - &arg); + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from, &arg); + rcu_read_unlock(); } else { struct fib6_nh *fib6_nh; @@ -1221,7 +1222,9 @@ next_iter: fib6_nsiblings++; } BUG_ON(fib6_nsiblings != rt->fib6_nsiblings); + rcu_read_lock(); rt6_multipath_rebalance(temp_sibling); + rcu_read_unlock(); } /* @@ -1264,7 +1267,9 @@ add: sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; list_del_rcu(&rt->fib6_siblings); + rcu_read_lock(); 
rt6_multipath_rebalance(next_sibling); + rcu_read_unlock(); return err; } } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6baf177c529b..a87091dd06b1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1820,11 +1820,13 @@ static int rt6_nh_flush_exceptions(struct fib6_nh *nh, void *arg) void rt6_flush_exceptions(struct fib6_info *f6i) { - if (f6i->nh) - nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, - f6i); - else + if (f6i->nh) { + rcu_read_lock(); + nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_flush_exceptions, f6i); + rcu_read_unlock(); + } else { fib6_nh_flush_exceptions(f6i->fib6_nh, f6i); + } } /* Find cached rt in the hash table inside passed in rt @@ -3841,6 +3843,8 @@ static int ip6_route_info_create_nh(struct fib6_info *rt, if (cfg->fc_nh_id) { struct nexthop *nh; + rcu_read_lock(); + nh = nexthop_find_by_id(net, cfg->fc_nh_id); if (!nh) { err = -EINVAL; @@ -3860,6 +3864,8 @@ static int ip6_route_info_create_nh(struct fib6_info *rt, rt->nh = nh; fib6_nh = nexthop_fib6_nh(rt->nh); + + rcu_read_unlock(); } else { int addr_type; @@ -3895,6 +3901,7 @@ out_release: fib6_info_release(rt); return err; out_free: + rcu_read_unlock(); ip_fib_metrics_put(rt->fib6_metrics); kfree(rt); return err; @@ -3910,16 +3917,12 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, if (IS_ERR(rt)) return PTR_ERR(rt); - rcu_read_lock(); - err = ip6_route_info_create_nh(rt, cfg, extack); if (err) - goto unlock; + return err; err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack); fib6_info_release(rt); -unlock: - rcu_read_unlock(); return err; } @@ -5534,8 +5537,6 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, if (err) return err; - rcu_read_lock(); - err = ip6_route_mpath_info_create_nh(&rt6_nh_list, extack); if (err) goto cleanup; @@ -5627,8 +5628,6 @@ add_errout: } cleanup: - rcu_read_unlock(); - list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, list) { fib6_info_release(nh->fib6_info); list_del(&nh->list); @@ -6410,6 +6409,8 @@ void 
inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; + rcu_read_lock(); + skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC); if (!skb) goto errout; @@ -6422,10 +6423,14 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, kfree_skb(skb); goto errout; } + + rcu_read_unlock(); + rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, info->nlh, GFP_ATOMIC); return; errout: + rcu_read_unlock(); rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err); } From cefe6e131cc4f032110efe1687295e133f3d5964 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 May 2025 19:27:20 -0700 Subject: [PATCH 4/7] Revert "ipv6: sr: switch to GFP_ATOMIC flag to allocate memory during seg6local LWT setup" The previous patch fixed the same issue mentioned in commit 14a0087e7236 ("ipv6: sr: switch to GFP_ATOMIC flag to allocate memory during seg6local LWT setup"). Let's revert it. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Andrea Mayer Link: https://patch.msgid.link/20250516022759.44392-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv6/seg6_local.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index ee5e448cc7a8..ac1dbd492c22 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -1671,7 +1671,7 @@ static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt, if (!seg6_validate_srh(srh, len, false)) return -EINVAL; - slwt->srh = kmemdup(srh, len, GFP_ATOMIC); + slwt->srh = kmemdup(srh, len, GFP_KERNEL); if (!slwt->srh) return -ENOMEM; @@ -1911,7 +1911,7 @@ static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt, if (!tb[SEG6_LOCAL_BPF_PROG] || !tb[SEG6_LOCAL_BPF_PROG_NAME]) return -EINVAL; - slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_ATOMIC); + slwt->bpf.name = nla_memdup(tb[SEG6_LOCAL_BPF_PROG_NAME], GFP_KERNEL); if (!slwt->bpf.name) return 
-ENOMEM; @@ -1994,7 +1994,7 @@ static int parse_nla_counters(struct nlattr **attrs, return -EINVAL; /* counters are always zero initialized */ - pcounters = seg6_local_alloc_pcpu_counters(GFP_ATOMIC); + pcounters = seg6_local_alloc_pcpu_counters(GFP_KERNEL); if (!pcounters) return -ENOMEM; From 5e4a8cc7beb8567293e6d4230b14e95167759214 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 May 2025 19:27:21 -0700 Subject: [PATCH 5/7] Revert "ipv6: Factorise ip6_route_multipath_add()." Commit 71c0efb6d12f ("ipv6: Factorise ip6_route_multipath_add().") split a loop in ip6_route_multipath_add() so that we can put rcu_read_lock() between ip6_route_info_create() and ip6_route_info_create_nh(). We no longer need to do so as ip6_route_info_create_nh() does not require RCU now. Let's revert the commit to simplify the code. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250516022759.44392-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 193 +++++++++++++++++------------------------------ 1 file changed, 70 insertions(+), 123 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a87091dd06b1..96ae21da9961 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -5335,131 +5335,29 @@ struct rt6_nh { struct fib6_info *fib6_info; struct fib6_config r_cfg; struct list_head list; - int weight; }; -static void ip6_route_mpath_info_cleanup(struct list_head *rt6_nh_list) +static int ip6_route_info_append(struct list_head *rt6_nh_list, + struct fib6_info *rt, + struct fib6_config *r_cfg) { - struct rt6_nh *nh, *nh_next; + struct rt6_nh *nh; - list_for_each_entry_safe(nh, nh_next, rt6_nh_list, list) { - struct fib6_info *rt = nh->fib6_info; - - if (rt) { - free_percpu(rt->fib6_nh->nh_common.nhc_pcpu_rth_output); - free_percpu(rt->fib6_nh->rt6i_pcpu); - ip_fib_metrics_put(rt->fib6_metrics); - kfree(rt); - } - - list_del(&nh->list); - kfree(nh); + list_for_each_entry(nh, rt6_nh_list, list) { + /* check if fib6_info already 
exists */ + if (rt6_duplicate_nexthop(nh->fib6_info, rt)) + return -EEXIST; } -} -static int ip6_route_mpath_info_create(struct list_head *rt6_nh_list, - struct fib6_config *cfg, - struct netlink_ext_ack *extack) -{ - struct rtnexthop *rtnh; - int remaining; - int err; + nh = kzalloc(sizeof(*nh), GFP_KERNEL); + if (!nh) + return -ENOMEM; - remaining = cfg->fc_mp_len; - rtnh = (struct rtnexthop *)cfg->fc_mp; - - /* Parse a Multipath Entry and build a list (rt6_nh_list) of - * fib6_info structs per nexthop - */ - while (rtnh_ok(rtnh, remaining)) { - struct fib6_config r_cfg; - struct fib6_info *rt; - struct rt6_nh *nh; - int attrlen; - - nh = kzalloc(sizeof(*nh), GFP_KERNEL); - if (!nh) { - err = -ENOMEM; - goto err; - } - - list_add_tail(&nh->list, rt6_nh_list); - - memcpy(&r_cfg, cfg, sizeof(*cfg)); - if (rtnh->rtnh_ifindex) - r_cfg.fc_ifindex = rtnh->rtnh_ifindex; - - attrlen = rtnh_attrlen(rtnh); - if (attrlen > 0) { - struct nlattr *nla, *attrs = rtnh_attrs(rtnh); - - nla = nla_find(attrs, attrlen, RTA_GATEWAY); - if (nla) { - r_cfg.fc_gateway = nla_get_in6_addr(nla); - r_cfg.fc_flags |= RTF_GATEWAY; - } - - r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); - nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); - if (nla) - r_cfg.fc_encap_type = nla_get_u16(nla); - } - - r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); - - rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); - if (IS_ERR(rt)) { - err = PTR_ERR(rt); - goto err; - } - - nh->fib6_info = rt; - nh->weight = rtnh->rtnh_hops + 1; - memcpy(&nh->r_cfg, &r_cfg, sizeof(r_cfg)); - - rtnh = rtnh_next(rtnh, &remaining); - } + nh->fib6_info = rt; + memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg)); + list_add_tail(&nh->list, rt6_nh_list); return 0; -err: - ip6_route_mpath_info_cleanup(rt6_nh_list); - return err; -} - -static int ip6_route_mpath_info_create_nh(struct list_head *rt6_nh_list, - struct netlink_ext_ack *extack) -{ - struct rt6_nh *nh, *nh_next, *nh_tmp; - LIST_HEAD(tmp); - int err; - - 
list_for_each_entry_safe(nh, nh_next, rt6_nh_list, list) { - struct fib6_info *rt = nh->fib6_info; - - err = ip6_route_info_create_nh(rt, &nh->r_cfg, extack); - if (err) { - nh->fib6_info = NULL; - goto err; - } - - rt->fib6_nh->fib_nh_weight = nh->weight; - - list_move_tail(&nh->list, &tmp); - - list_for_each_entry(nh_tmp, rt6_nh_list, list) { - /* check if fib6_info already exists */ - if (rt6_duplicate_nexthop(nh_tmp->fib6_info, rt)) { - err = -EEXIST; - goto err; - } - } - } -out: - list_splice(&tmp, rt6_nh_list); - return err; -err: - ip6_route_mpath_info_cleanup(rt6_nh_list); - goto out; } static void ip6_route_mpath_notify(struct fib6_info *rt, @@ -5519,11 +5417,16 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, struct fib6_info *rt_notif = NULL, *rt_last = NULL; struct nl_info *info = &cfg->fc_nlinfo; struct rt6_nh *nh, *nh_safe; + struct fib6_config r_cfg; + struct rtnexthop *rtnh; LIST_HEAD(rt6_nh_list); struct rt6_nh *err_nh; + struct fib6_info *rt; __u16 nlflags; - int nhn = 0; + int remaining; + int attrlen; int replace; + int nhn = 0; int err; replace = (cfg->fc_nlinfo.nlh && @@ -5533,13 +5436,57 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND) nlflags |= NLM_F_APPEND; - err = ip6_route_mpath_info_create(&rt6_nh_list, cfg, extack); - if (err) - return err; + remaining = cfg->fc_mp_len; + rtnh = (struct rtnexthop *)cfg->fc_mp; - err = ip6_route_mpath_info_create_nh(&rt6_nh_list, extack); - if (err) - goto cleanup; + /* Parse a Multipath Entry and build a list (rt6_nh_list) of + * fib6_info structs per nexthop + */ + while (rtnh_ok(rtnh, remaining)) { + memcpy(&r_cfg, cfg, sizeof(*cfg)); + if (rtnh->rtnh_ifindex) + r_cfg.fc_ifindex = rtnh->rtnh_ifindex; + + attrlen = rtnh_attrlen(rtnh); + if (attrlen > 0) { + struct nlattr *nla, *attrs = rtnh_attrs(rtnh); + + nla = nla_find(attrs, attrlen, RTA_GATEWAY); + if (nla) { + r_cfg.fc_gateway = nla_get_in6_addr(nla); + 
r_cfg.fc_flags |= RTF_GATEWAY; + } + + r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP); + nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE); + if (nla) + r_cfg.fc_encap_type = nla_get_u16(nla); + } + + r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK); + rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack); + if (IS_ERR(rt)) { + err = PTR_ERR(rt); + rt = NULL; + goto cleanup; + } + + err = ip6_route_info_create_nh(rt, &r_cfg, extack); + if (err) { + rt = NULL; + goto cleanup; + } + + rt->fib6_nh->fib_nh_weight = rtnh->rtnh_hops + 1; + + err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg); + if (err) { + fib6_info_release(rt); + goto cleanup; + } + + rtnh = rtnh_next(rtnh, &remaining); + } /* for add and replace send one notification with all nexthops. * Skip the notification in fib6_add_rt2node and send one with From d465bd07d16e37cd3aa25539ab187b372853808d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 May 2025 19:27:22 -0700 Subject: [PATCH 6/7] ipv6: Pass gfp_flags down to ip6_route_info_create_nh(). Since commit c4837b9853e5 ("ipv6: Split ip6_route_info_create()."), ip6_route_info_create_nh() uses GFP_ATOMIC as it was expected to be called under RCU. Now, we can call it without RCU and use GFP_KERNEL. Let's pass gfp_flags to ip6_route_info_create_nh(). 
Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250516022759.44392-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv6/route.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 96ae21da9961..dda913ebd2d3 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3834,6 +3834,7 @@ err: static int ip6_route_info_create_nh(struct fib6_info *rt, struct fib6_config *cfg, + gfp_t gfp_flags, struct netlink_ext_ack *extack) { struct net *net = cfg->fc_nlinfo.nl_net; @@ -3869,7 +3870,7 @@ static int ip6_route_info_create_nh(struct fib6_info *rt, } else { int addr_type; - err = fib6_nh_init(net, rt->fib6_nh, cfg, GFP_ATOMIC, extack); + err = fib6_nh_init(net, rt->fib6_nh, cfg, gfp_flags, extack); if (err) goto out_release; @@ -3917,7 +3918,7 @@ int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags, if (IS_ERR(rt)) return PTR_ERR(rt); - err = ip6_route_info_create_nh(rt, cfg, extack); + err = ip6_route_info_create_nh(rt, cfg, gfp_flags, extack); if (err) return err; @@ -4707,7 +4708,7 @@ struct fib6_info *addrconf_f6i_alloc(struct net *net, if (IS_ERR(f6i)) return f6i; - err = ip6_route_info_create_nh(f6i, &cfg, extack); + err = ip6_route_info_create_nh(f6i, &cfg, gfp_flags, extack); if (err) return ERR_PTR(err); @@ -5471,7 +5472,7 @@ static int ip6_route_multipath_add(struct fib6_config *cfg, goto cleanup; } - err = ip6_route_info_create_nh(rt, &r_cfg, extack); + err = ip6_route_info_create_nh(rt, &r_cfg, GFP_KERNEL, extack); if (err) { rt = NULL; goto cleanup; From 002dba13c824f1cf86f618f0d23d1f0ad3c93bbb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 15 May 2025 19:27:23 -0700 Subject: [PATCH 7/7] ipv6: Revert two per-cpu var allocation for RTM_NEWROUTE. These two commits preallocated two per-cpu variables in ip6_route_info_create() as fib_nh_common_init() and fib6_nh_init() were expected to be called under RCU. 
* commit d27b9c40dbd6 ("ipv6: Preallocate nhc_pcpu_rth_output in ip6_route_info_create().") * commit 5720a328c3e9 ("ipv6: Preallocate rt->fib6_nh->rt6i_pcpu in ip6_route_info_create().") Now these functions can be called without RCU and can use GFP_KERNEL. Let's revert the commits. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250516022759.44392-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski --- net/ipv4/fib_semantics.c | 10 ++++------ net/ipv6/route.c | 34 +++------------------------------- 2 files changed, 7 insertions(+), 37 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index dabe2b7044ab..d643bd1a0d9d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -617,12 +617,10 @@ int fib_nh_common_init(struct net *net, struct fib_nh_common *nhc, { int err; - if (!nhc->nhc_pcpu_rth_output) { - nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, - gfp_flags); - if (!nhc->nhc_pcpu_rth_output) - return -ENOMEM; - } + nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, + gfp_flags); + if (!nhc->nhc_pcpu_rth_output) + return -ENOMEM; if (encap) { struct lwtunnel_state *lwtstate; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index dda913ebd2d3..0143262094b0 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -3674,12 +3674,10 @@ int fib6_nh_init(struct net *net, struct fib6_nh *fib6_nh, goto out; pcpu_alloc: + fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); if (!fib6_nh->rt6i_pcpu) { - fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, gfp_flags); - if (!fib6_nh->rt6i_pcpu) { - err = -ENOMEM; - goto out; - } + err = -ENOMEM; + goto out; } fib6_nh->fib_nh_dev = dev; @@ -3739,24 +3737,6 @@ void fib6_nh_release_dsts(struct fib6_nh *fib6_nh) } } -static int fib6_nh_prealloc_percpu(struct fib6_nh *fib6_nh, gfp_t gfp_flags) -{ - struct fib_nh_common *nhc = &fib6_nh->nh_common; - - fib6_nh->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, 
gfp_flags); - if (!fib6_nh->rt6i_pcpu) - return -ENOMEM; - - nhc->nhc_pcpu_rth_output = alloc_percpu_gfp(struct rtable __rcu *, - gfp_flags); - if (!nhc->nhc_pcpu_rth_output) { - free_percpu(fib6_nh->rt6i_pcpu); - return -ENOMEM; - } - - return 0; -} - static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, gfp_t gfp_flags, struct netlink_ext_ack *extack) @@ -3794,12 +3774,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, goto free; } - if (!cfg->fc_nh_id) { - err = fib6_nh_prealloc_percpu(&rt->fib6_nh[0], gfp_flags); - if (err) - goto free_metrics; - } - if (cfg->fc_flags & RTF_ADDRCONF) rt->dst_nocount = true; @@ -3824,8 +3798,6 @@ static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg, rt->fib6_src.plen = cfg->fc_src_len; #endif return rt; -free_metrics: - ip_fib_metrics_put(rt->fib6_metrics); free: kfree(rt); err: