From ae9d5b19b322d4b557efda684da2f4df21670ef8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 2 Mar 2025 12:42:34 +0000 Subject: [PATCH 1/4] tcp: use RCU in __inet{6}_check_established() When __inet_hash_connect() has to try many 4-tuples before finding an available one, we see a high spinlock cost from __inet_check_established() and/or __inet6_check_established(). This patch adds an RCU lookup to avoid the spinlock acquisition when the 4-tuple is found in the hash table. Note that there are still spin_lock_bh() calls in __inet_hash_connect() to protect inet_bind_hashbucket, this will be fixed later in this series. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Tested-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250302124237.3913746-2-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_hashtables.c | 19 ++++++++++++++++--- net/ipv6/inet6_hashtables.c | 19 ++++++++++++++++--- 2 files changed, 32 insertions(+), 6 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 9bfcfd016e18..46d39aa2199e 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -551,11 +551,24 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, unsigned int hash = inet_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); - spinlock_t *lock = inet_ehash_lockp(hinfo, hash); - struct sock *sk2; - const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; + const struct hlist_nulls_node *node; + struct sock *sk2; + spinlock_t *lock; + rcu_read_lock(); + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet_match(net, sk2, acookie, ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + rcu_read_unlock(); + return -EADDRNOTAVAIL; + } + rcu_read_unlock(); + + lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 9ec05e354baa..3604a5cae5d2 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -276,11 +276,24 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr, inet->inet_dport); struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); - spinlock_t *lock = inet_ehash_lockp(hinfo, hash); - struct sock *sk2; - const struct hlist_nulls_node *node; struct inet_timewait_sock *tw = NULL; + const struct hlist_nulls_node *node; + struct sock *sk2; + spinlock_t *lock; + rcu_read_lock(); + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet6_match(net, sk2, saddr, daddr, ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + rcu_read_unlock(); + return -EADDRNOTAVAIL; + } + rcu_read_unlock(); + + lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); sk_nulls_for_each(sk2, node, &head->chain) { From ca79d80b0b9f42362a893f06413a9fe91811158a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 2 Mar 2025 12:42:35 +0000 Subject: [PATCH 2/4] tcp: optimize inet_use_bhash2_on_bind() There is no reason to call ipv6_addr_type(). Instead, use highly optimized ipv6_addr_any() and ipv6_addr_v4mapped(). Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250302124237.3913746-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/ipv4/inet_connection_sock.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index bf9ce0c19657..b4e514da22b6 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -157,12 +157,10 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) if (sk->sk_family == AF_INET6) { - int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr); - - if (addr_type == IPV6_ADDR_ANY) + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) return false; - if (addr_type != IPV6_ADDR_MAPPED) + if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr)) return true; } #endif From d186f405fdf4229d0e9a52ba71662404b06cc002 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 2 Mar 2025 12:42:36 +0000 Subject: [PATCH 3/4] tcp: add RCU management to inet_bind_bucket Add RCU protection to inet_bind_bucket structure. - Add rcu_head field to the structure definition. - Use kfree_rcu() at destroy time, and remove inet_bind_bucket_destroy() first argument. - Use hlist_del_rcu() and hlist_add_head_rcu() methods. Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250302124237.3913746-4-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 4 ++-- net/ipv4/inet_connection_sock.c | 2 +- net/ipv4/inet_hashtables.c | 14 +++++++------- net/ipv4/inet_timewait_sock.c | 2 +- 4 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index da818fb0205f..1061c4f536a6 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -89,6 +89,7 @@ struct inet_bind_bucket { bool fast_ipv6_only; struct hlist_node node; struct hlist_head bhash2; + struct rcu_head rcu; }; struct inet_bind2_bucket { @@ -226,8 +227,7 @@ struct inet_bind_bucket * inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net, struct inet_bind_hashbucket *head, const unsigned short snum, int l3mdev); -void inet_bind_bucket_destroy(struct kmem_cache *cachep, - struct inet_bind_bucket *tb); +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb); bool inet_bind_bucket_match(const struct inet_bind_bucket *tb, const struct net *net, unsigned short port, diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index b4e514da22b6..e93c66034077 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -598,7 +598,7 @@ fail_unlock: if (bhash2_created) inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2); if (bhash_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); } if (head2_lock_acquired) spin_unlock(&head2->lock); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 46d39aa2199e..b737e13f8459 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -76,7 +76,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, tb->fastreuse = 0; tb->fastreuseport = 0; INIT_HLIST_HEAD(&tb->bhash2); - hlist_add_head(&tb->node, &head->chain); + hlist_add_head_rcu(&tb->node, &head->chain); } return tb; } @@ -84,11 +84,11 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep, /* * Caller must hold hashbucket lock for this tb with local BH disabled */ -void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb) +void inet_bind_bucket_destroy(struct inet_bind_bucket *tb) { if (hlist_empty(&tb->bhash2)) { - __hlist_del(&tb->node); - kmem_cache_free(cachep, tb); + hlist_del_rcu(&tb->node); + kfree_rcu(tb, rcu); } } @@ -201,7 +201,7 @@ static void __inet_put_port(struct sock *sk) } spin_unlock(&head2->lock); - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); } @@ -285,7 +285,7 @@ bhash2_find: error: if (created_inet_bind_bucket) - inet_bind_bucket_destroy(table->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head2->lock); spin_unlock(&head->lock); return -ENOMEM; @@ -1162,7 +1162,7 @@ error: spin_unlock(&head2->lock); if (tb_created) - inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); spin_unlock(&head->lock); if (tw) diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 337390ba85b4..aded4bf1bc16 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -39,7 +39,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw, tw->tw_tb = NULL; tw->tw_tb2 = NULL; inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2); - inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb); + inet_bind_bucket_destroy(tb); __sock_put((struct sock *)tw); } From 86c2bc293b8130aec9fa504e953531a84a6eb9a6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 2 Mar 2025 12:42:37 +0000 Subject: [PATCH 4/4] tcp: use RCU lookup in __inet_hash_connect() When __inet_hash_connect() has to try many 4-tuples before finding an available one, we see a high spinlock cost from the many spin_lock_bh(&head->lock) performed in its loop. This patch adds an RCU lookup to avoid the spinlock cost. check_established() gets a new @rcu_lookup argument. First reason is to not make any changes while head->lock is not held. Second reason is to not make this RCU lookup a second time after the spinlock has been acquired. Tested: Server: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog Client: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server Before series: utime_start=0.288582 utime_end=1.548707 stime_start=20.637138 stime_end=2002.489845 num_transactions=484453 latency_min=0.156279245 latency_max=20.922042756 latency_mean=1.546521274 latency_stddev=3.936005194 num_samples=312537 throughput=47426.00 perf top on the client: 49.54% [kernel] [k] _raw_spin_lock 25.87% [kernel] [k] _raw_spin_lock_bh 5.97% [kernel] [k] queued_spin_lock_slowpath 5.67% [kernel] [k] __inet_hash_connect 3.53% [kernel] [k] __inet6_check_established 3.48% [kernel] [k] inet6_ehashfn 0.64% [kernel] [k] rcu_all_qs After this series: utime_start=0.271607 utime_end=3.847111 stime_start=18.407684 stime_end=1997.485557 num_transactions=1350742 latency_min=0.014131929 latency_max=17.895073144 latency_mean=0.505675853 # Nice reduction of latency metrics latency_stddev=2.125164772 num_samples=307884 throughput=139866.80 # 190 % increase perf top on client: 56.86% [kernel] [k] __inet6_check_established 17.96% [kernel] [k] __inet_hash_connect 13.88% [kernel] [k] inet6_ehashfn 2.52% [kernel] [k] rcu_all_qs 2.01% [kernel] [k] __cond_resched 0.41% [kernel] [k] _raw_spin_lock Signed-off-by: Eric Dumazet Reviewed-by: Jason Xing Tested-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250302124237.3913746-5-edumazet@google.com Signed-off-by: Jakub Kicinski --- include/net/inet_hashtables.h | 3 +- net/ipv4/inet_hashtables.c | 52 +++++++++++++++++++++++------------ net/ipv6/inet6_hashtables.c | 24 ++++++++-------- 3 files changed, 50 insertions(+), 29 deletions(-) diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 1061c4f536a6..f447d61d9598 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -529,7 +529,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, struct sock *, __u16, - struct inet_timewait_sock **)); + struct inet_timewait_sock **, + bool rcu_lookup)); int inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk); diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index b737e13f8459..d1b5f45ee718 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -537,7 +537,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established); /* called with local bh disabled */ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -556,17 +557,17 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; spinlock_t *lock; - rcu_read_lock(); - sk_nulls_for_each(sk2, node, &head->chain) { - if (sk2->sk_hash != hash || - !inet_match(net, sk2, acookie, ports, dif, sdif)) - continue; - if (sk2->sk_state == TCP_TIME_WAIT) - break; - rcu_read_unlock(); - return -EADDRNOTAVAIL; + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet_match(net, sk2, acookie, ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; } - rcu_read_unlock(); lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock); @@ -1007,7 +1008,8 @@ static u32 *table_perturb; int __inet_hash_connect(struct inet_timewait_death_row *death_row, struct sock *sk, u64 port_offset, int (*check_established)(struct inet_timewait_death_row *, - struct sock *, __u16, struct inet_timewait_sock **)) + struct sock *, __u16, struct inet_timewait_sock **, + bool rcu_lookup)) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_bind_hashbucket *head, *head2; @@ -1025,7 +1027,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (port) { local_bh_disable(); - ret = check_established(death_row, sk, port, NULL); + ret = check_established(death_row, sk, port, NULL, false); local_bh_enable(); return ret; } @@ -1061,6 +1063,21 @@ other_parity_scan: continue; head = &hinfo->bhash[inet_bhashfn(net, port, hinfo->bhash_size)]; + rcu_read_lock(); + hlist_for_each_entry_rcu(tb, &head->chain, node) { + if (!inet_bind_bucket_match(tb, net, port, l3mdev)) + continue; + if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) { + rcu_read_unlock(); + goto next_port; + } + if (!check_established(death_row, sk, port, &tw, true)) + break; + rcu_read_unlock(); + goto next_port; + } + rcu_read_unlock(); + spin_lock_bh(&head->lock); /* Does not bother with rcv_saddr checks, because @@ -1070,12 +1087,12 @@ other_parity_scan: if (inet_bind_bucket_match(tb, net, port, l3mdev)) { if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) - goto next_port; + goto next_port_unlock; WARN_ON(hlist_empty(&tb->bhash2)); if (!check_established(death_row, sk, - port, &tw)) + port, &tw, false)) goto ok; - goto next_port; + goto next_port_unlock; } } @@ -1089,8 +1106,9 @@ other_parity_scan: tb->fastreuse = -1; tb->fastreuseport = -1; goto ok; -next_port: +next_port_unlock: spin_unlock_bh(&head->lock); +next_port: cond_resched(); } diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 3604a5cae5d2..9be315496459 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -263,7 +263,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup); static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) + struct inet_timewait_sock **twp, + bool rcu_lookup) { struct inet_hashinfo *hinfo = death_row->hashinfo; struct inet_sock *inet = inet_sk(sk); @@ -281,17 +282,18 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row, struct sock *sk2; spinlock_t *lock; - rcu_read_lock(); - sk_nulls_for_each(sk2, node, &head->chain) { - if (sk2->sk_hash != hash || - !inet6_match(net, sk2, saddr, daddr, ports, dif, sdif)) - continue; - if (sk2->sk_state == TCP_TIME_WAIT) - break; - rcu_read_unlock(); - return -EADDRNOTAVAIL; + if (rcu_lookup) { + sk_nulls_for_each(sk2, node, &head->chain) { + if (sk2->sk_hash != hash || + !inet6_match(net, sk2, saddr, daddr, + ports, dif, sdif)) + continue; + if (sk2->sk_state == TCP_TIME_WAIT) + break; + return -EADDRNOTAVAIL; + } + return 0; } - rcu_read_unlock(); lock = inet_ehash_lockp(hinfo, hash); spin_lock(lock);