Merge branch 'tcp-scale-connect-under-pressure'
Eric Dumazet says: ==================== tcp: scale connect() under pressure Adoption of bhash2 in linux-6.1 made some operations almost twice as expensive, because of additional locks. This series adds RCU in __inet_hash_connect() to help the case where many attempts need to be made before finding an available 4-tuple. This brings a ~200 % improvement in this experiment: Server: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog Client: ulimit -n 40000; neper/tcp_crr -T 200 -F 30000 -6 --nolog -c -H server Before series: utime_start=0.288582 utime_end=1.548707 stime_start=20.637138 stime_end=2002.489845 num_transactions=484453 latency_min=0.156279245 latency_max=20.922042756 latency_mean=1.546521274 latency_stddev=3.936005194 num_samples=312537 throughput=47426.00 perf top on the client: 49.54% [kernel] [k] _raw_spin_lock 25.87% [kernel] [k] _raw_spin_lock_bh 5.97% [kernel] [k] queued_spin_lock_slowpath 5.67% [kernel] [k] __inet_hash_connect 3.53% [kernel] [k] __inet6_check_established 3.48% [kernel] [k] inet6_ehashfn 0.64% [kernel] [k] rcu_all_qs After this series: utime_start=0.271607 utime_end=3.847111 stime_start=18.407684 stime_end=1997.485557 num_transactions=1350742 latency_min=0.014131929 latency_max=17.895073144 latency_mean=0.505675853 # Nice reduction of latency metrics latency_stddev=2.125164772 num_samples=307884 throughput=139866.80 # 194 % increase perf top on client: 56.86% [kernel] [k] __inet6_check_established 17.96% [kernel] [k] __inet_hash_connect 13.88% [kernel] [k] inet6_ehashfn 2.52% [kernel] [k] rcu_all_qs 2.01% [kernel] [k] __cond_resched 0.41% [kernel] [k] _raw_spin_lock ==================== Link: https://patch.msgid.link/20250302124237.3913746-1-edumazet@google.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
@@ -89,6 +89,7 @@ struct inet_bind_bucket {
|
||||
bool fast_ipv6_only;
|
||||
struct hlist_node node;
|
||||
struct hlist_head bhash2;
|
||||
struct rcu_head rcu;
|
||||
};
|
||||
|
||||
struct inet_bind2_bucket {
|
||||
@@ -226,8 +227,7 @@ struct inet_bind_bucket *
|
||||
inet_bind_bucket_create(struct kmem_cache *cachep, struct net *net,
|
||||
struct inet_bind_hashbucket *head,
|
||||
const unsigned short snum, int l3mdev);
|
||||
void inet_bind_bucket_destroy(struct kmem_cache *cachep,
|
||||
struct inet_bind_bucket *tb);
|
||||
void inet_bind_bucket_destroy(struct inet_bind_bucket *tb);
|
||||
|
||||
bool inet_bind_bucket_match(const struct inet_bind_bucket *tb,
|
||||
const struct net *net, unsigned short port,
|
||||
@@ -529,7 +529,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
|
||||
struct sock *sk, u64 port_offset,
|
||||
int (*check_established)(struct inet_timewait_death_row *,
|
||||
struct sock *, __u16,
|
||||
struct inet_timewait_sock **));
|
||||
struct inet_timewait_sock **,
|
||||
bool rcu_lookup));
|
||||
|
||||
int inet_hash_connect(struct inet_timewait_death_row *death_row,
|
||||
struct sock *sk);
|
||||
|
||||
@@ -157,12 +157,10 @@ static bool inet_use_bhash2_on_bind(const struct sock *sk)
|
||||
{
|
||||
#if IS_ENABLED(CONFIG_IPV6)
|
||||
if (sk->sk_family == AF_INET6) {
|
||||
int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
|
||||
|
||||
if (addr_type == IPV6_ADDR_ANY)
|
||||
if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
|
||||
return false;
|
||||
|
||||
if (addr_type != IPV6_ADDR_MAPPED)
|
||||
if (!ipv6_addr_v4mapped(&sk->sk_v6_rcv_saddr))
|
||||
return true;
|
||||
}
|
||||
#endif
|
||||
@@ -600,7 +598,7 @@ fail_unlock:
|
||||
if (bhash2_created)
|
||||
inet_bind2_bucket_destroy(hinfo->bind2_bucket_cachep, tb2);
|
||||
if (bhash_created)
|
||||
inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
|
||||
inet_bind_bucket_destroy(tb);
|
||||
}
|
||||
if (head2_lock_acquired)
|
||||
spin_unlock(&head2->lock);
|
||||
|
||||
+48
-17
@@ -76,7 +76,7 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
|
||||
tb->fastreuse = 0;
|
||||
tb->fastreuseport = 0;
|
||||
INIT_HLIST_HEAD(&tb->bhash2);
|
||||
hlist_add_head(&tb->node, &head->chain);
|
||||
hlist_add_head_rcu(&tb->node, &head->chain);
|
||||
}
|
||||
return tb;
|
||||
}
|
||||
@@ -84,11 +84,11 @@ struct inet_bind_bucket *inet_bind_bucket_create(struct kmem_cache *cachep,
|
||||
/*
|
||||
* Caller must hold hashbucket lock for this tb with local BH disabled
|
||||
*/
|
||||
void inet_bind_bucket_destroy(struct kmem_cache *cachep, struct inet_bind_bucket *tb)
|
||||
void inet_bind_bucket_destroy(struct inet_bind_bucket *tb)
|
||||
{
|
||||
if (hlist_empty(&tb->bhash2)) {
|
||||
__hlist_del(&tb->node);
|
||||
kmem_cache_free(cachep, tb);
|
||||
hlist_del_rcu(&tb->node);
|
||||
kfree_rcu(tb, rcu);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,7 +201,7 @@ static void __inet_put_port(struct sock *sk)
|
||||
}
|
||||
spin_unlock(&head2->lock);
|
||||
|
||||
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
|
||||
inet_bind_bucket_destroy(tb);
|
||||
spin_unlock(&head->lock);
|
||||
}
|
||||
|
||||
@@ -285,7 +285,7 @@ bhash2_find:
|
||||
|
||||
error:
|
||||
if (created_inet_bind_bucket)
|
||||
inet_bind_bucket_destroy(table->bind_bucket_cachep, tb);
|
||||
inet_bind_bucket_destroy(tb);
|
||||
spin_unlock(&head2->lock);
|
||||
spin_unlock(&head->lock);
|
||||
return -ENOMEM;
|
||||
@@ -537,7 +537,8 @@ EXPORT_SYMBOL_GPL(__inet_lookup_established);
|
||||
/* called with local bh disabled */
|
||||
static int __inet_check_established(struct inet_timewait_death_row *death_row,
|
||||
struct sock *sk, __u16 lport,
|
||||
struct inet_timewait_sock **twp)
|
||||
struct inet_timewait_sock **twp,
|
||||
bool rcu_lookup)
|
||||
{
|
||||
struct inet_hashinfo *hinfo = death_row->hashinfo;
|
||||
struct inet_sock *inet = inet_sk(sk);
|
||||
@@ -551,11 +552,24 @@ static int __inet_check_established(struct inet_timewait_death_row *death_row,
|
||||
unsigned int hash = inet_ehashfn(net, daddr, lport,
|
||||
saddr, inet->inet_dport);
|
||||
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
|
||||
spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
|
||||
struct sock *sk2;
|
||||
const struct hlist_nulls_node *node;
|
||||
struct inet_timewait_sock *tw = NULL;
|
||||
const struct hlist_nulls_node *node;
|
||||
struct sock *sk2;
|
||||
spinlock_t *lock;
|
||||
|
||||
if (rcu_lookup) {
|
||||
sk_nulls_for_each(sk2, node, &head->chain) {
|
||||
if (sk2->sk_hash != hash ||
|
||||
!inet_match(net, sk2, acookie, ports, dif, sdif))
|
||||
continue;
|
||||
if (sk2->sk_state == TCP_TIME_WAIT)
|
||||
break;
|
||||
return -EADDRNOTAVAIL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
lock = inet_ehash_lockp(hinfo, hash);
|
||||
spin_lock(lock);
|
||||
|
||||
sk_nulls_for_each(sk2, node, &head->chain) {
|
||||
@@ -994,7 +1008,8 @@ static u32 *table_perturb;
|
||||
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
|
||||
struct sock *sk, u64 port_offset,
|
||||
int (*check_established)(struct inet_timewait_death_row *,
|
||||
struct sock *, __u16, struct inet_timewait_sock **))
|
||||
struct sock *, __u16, struct inet_timewait_sock **,
|
||||
bool rcu_lookup))
|
||||
{
|
||||
struct inet_hashinfo *hinfo = death_row->hashinfo;
|
||||
struct inet_bind_hashbucket *head, *head2;
|
||||
@@ -1012,7 +1027,7 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
|
||||
|
||||
if (port) {
|
||||
local_bh_disable();
|
||||
ret = check_established(death_row, sk, port, NULL);
|
||||
ret = check_established(death_row, sk, port, NULL, false);
|
||||
local_bh_enable();
|
||||
return ret;
|
||||
}
|
||||
@@ -1048,6 +1063,21 @@ other_parity_scan:
|
||||
continue;
|
||||
head = &hinfo->bhash[inet_bhashfn(net, port,
|
||||
hinfo->bhash_size)];
|
||||
rcu_read_lock();
|
||||
hlist_for_each_entry_rcu(tb, &head->chain, node) {
|
||||
if (!inet_bind_bucket_match(tb, net, port, l3mdev))
|
||||
continue;
|
||||
if (tb->fastreuse >= 0 || tb->fastreuseport >= 0) {
|
||||
rcu_read_unlock();
|
||||
goto next_port;
|
||||
}
|
||||
if (!check_established(death_row, sk, port, &tw, true))
|
||||
break;
|
||||
rcu_read_unlock();
|
||||
goto next_port;
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
spin_lock_bh(&head->lock);
|
||||
|
||||
/* Does not bother with rcv_saddr checks, because
|
||||
@@ -1057,12 +1087,12 @@ other_parity_scan:
|
||||
if (inet_bind_bucket_match(tb, net, port, l3mdev)) {
|
||||
if (tb->fastreuse >= 0 ||
|
||||
tb->fastreuseport >= 0)
|
||||
goto next_port;
|
||||
goto next_port_unlock;
|
||||
WARN_ON(hlist_empty(&tb->bhash2));
|
||||
if (!check_established(death_row, sk,
|
||||
port, &tw))
|
||||
port, &tw, false))
|
||||
goto ok;
|
||||
goto next_port;
|
||||
goto next_port_unlock;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1076,8 +1106,9 @@ other_parity_scan:
|
||||
tb->fastreuse = -1;
|
||||
tb->fastreuseport = -1;
|
||||
goto ok;
|
||||
next_port:
|
||||
next_port_unlock:
|
||||
spin_unlock_bh(&head->lock);
|
||||
next_port:
|
||||
cond_resched();
|
||||
}
|
||||
|
||||
@@ -1149,7 +1180,7 @@ error:
|
||||
|
||||
spin_unlock(&head2->lock);
|
||||
if (tb_created)
|
||||
inet_bind_bucket_destroy(hinfo->bind_bucket_cachep, tb);
|
||||
inet_bind_bucket_destroy(tb);
|
||||
spin_unlock(&head->lock);
|
||||
|
||||
if (tw)
|
||||
|
||||
@@ -39,7 +39,7 @@ void inet_twsk_bind_unhash(struct inet_timewait_sock *tw,
|
||||
tw->tw_tb = NULL;
|
||||
tw->tw_tb2 = NULL;
|
||||
inet_bind2_bucket_destroy(hashinfo->bind2_bucket_cachep, tb2);
|
||||
inet_bind_bucket_destroy(hashinfo->bind_bucket_cachep, tb);
|
||||
inet_bind_bucket_destroy(tb);
|
||||
|
||||
__sock_put((struct sock *)tw);
|
||||
}
|
||||
|
||||
@@ -263,7 +263,8 @@ EXPORT_SYMBOL_GPL(inet6_lookup);
|
||||
|
||||
static int __inet6_check_established(struct inet_timewait_death_row *death_row,
|
||||
struct sock *sk, const __u16 lport,
|
||||
struct inet_timewait_sock **twp)
|
||||
struct inet_timewait_sock **twp,
|
||||
bool rcu_lookup)
|
||||
{
|
||||
struct inet_hashinfo *hinfo = death_row->hashinfo;
|
||||
struct inet_sock *inet = inet_sk(sk);
|
||||
@@ -276,11 +277,25 @@ static int __inet6_check_established(struct inet_timewait_death_row *death_row,
|
||||
const unsigned int hash = inet6_ehashfn(net, daddr, lport, saddr,
|
||||
inet->inet_dport);
|
||||
struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash);
|
||||
spinlock_t *lock = inet_ehash_lockp(hinfo, hash);
|
||||
struct sock *sk2;
|
||||
const struct hlist_nulls_node *node;
|
||||
struct inet_timewait_sock *tw = NULL;
|
||||
const struct hlist_nulls_node *node;
|
||||
struct sock *sk2;
|
||||
spinlock_t *lock;
|
||||
|
||||
if (rcu_lookup) {
|
||||
sk_nulls_for_each(sk2, node, &head->chain) {
|
||||
if (sk2->sk_hash != hash ||
|
||||
!inet6_match(net, sk2, saddr, daddr,
|
||||
ports, dif, sdif))
|
||||
continue;
|
||||
if (sk2->sk_state == TCP_TIME_WAIT)
|
||||
break;
|
||||
return -EADDRNOTAVAIL;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
lock = inet_ehash_lockp(hinfo, hash);
|
||||
spin_lock(lock);
|
||||
|
||||
sk_nulls_for_each(sk2, node, &head->chain) {
|
||||
|
||||
Reference in New Issue
Block a user