Merge branch 'mptcp-fix-fallback-related-races'
Matthieu Baerts says: ==================== mptcp: fix fallback-related races This series contains 3 fixes somewhat related to various races we have while handling fallback. The root cause of the issues addressed here is that the check for "we can fallback to tcp now" and the related action are not atomic. That also applies to fallback due to MP_FAIL -- where the window race is even wider. Address the issue introducing an additional spinlock to bundle together all the relevant events, as per patch 1 and 2. These fixes can be backported up to v5.19 and v5.15. Note that mptcp_disconnect() unconditionally clears the fallback status (zeroing msk->flags) but doesn't touch the `allow_infinite_fallback` flag. Such an issue is addressed in patch 3, and can be backported up to v5.17. ==================== Link: https://patch.msgid.link/20250714-net-mptcp-fallback-races-v1-0-391aff963322@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
This commit is contained in:
+2
-1
@@ -978,8 +978,9 @@ static bool check_fully_established(struct mptcp_sock *msk, struct sock *ssk,
|
||||
if (subflow->mp_join)
|
||||
goto reset;
|
||||
subflow->mp_capable = 0;
|
||||
if (!mptcp_try_fallback(ssk))
|
||||
goto reset;
|
||||
pr_fallback(msk);
|
||||
mptcp_do_fallback(ssk);
|
||||
return false;
|
||||
}
|
||||
|
||||
|
||||
+7
-1
@@ -765,8 +765,14 @@ void mptcp_pm_mp_fail_received(struct sock *sk, u64 fail_seq)
|
||||
|
||||
pr_debug("fail_seq=%llu\n", fail_seq);
|
||||
|
||||
if (!READ_ONCE(msk->allow_infinite_fallback))
|
||||
/* After accepting the fail, we can't create any other subflows */
|
||||
spin_lock_bh(&msk->fallback_lock);
|
||||
if (!msk->allow_infinite_fallback) {
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
return;
|
||||
}
|
||||
msk->allow_subflows = false;
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
|
||||
if (!subflow->fail_tout) {
|
||||
pr_debug("send MP_FAIL response and infinite map\n");
|
||||
|
||||
+48
-8
@@ -560,10 +560,9 @@ static bool mptcp_check_data_fin(struct sock *sk)
|
||||
|
||||
static void mptcp_dss_corruption(struct mptcp_sock *msk, struct sock *ssk)
|
||||
{
|
||||
if (READ_ONCE(msk->allow_infinite_fallback)) {
|
||||
if (mptcp_try_fallback(ssk)) {
|
||||
MPTCP_INC_STATS(sock_net(ssk),
|
||||
MPTCP_MIB_DSSCORRUPTIONFALLBACK);
|
||||
mptcp_do_fallback(ssk);
|
||||
} else {
|
||||
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_DSSCORRUPTIONRESET);
|
||||
mptcp_subflow_reset(ssk);
|
||||
@@ -792,7 +791,7 @@ void mptcp_data_ready(struct sock *sk, struct sock *ssk)
|
||||
static void mptcp_subflow_joined(struct mptcp_sock *msk, struct sock *ssk)
|
||||
{
|
||||
mptcp_subflow_ctx(ssk)->map_seq = READ_ONCE(msk->ack_seq);
|
||||
WRITE_ONCE(msk->allow_infinite_fallback, false);
|
||||
msk->allow_infinite_fallback = false;
|
||||
mptcp_event(MPTCP_EVENT_SUB_ESTABLISHED, msk, ssk, GFP_ATOMIC);
|
||||
}
|
||||
|
||||
@@ -803,6 +802,14 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
|
||||
if (sk->sk_state != TCP_ESTABLISHED)
|
||||
return false;
|
||||
|
||||
spin_lock_bh(&msk->fallback_lock);
|
||||
if (!msk->allow_subflows) {
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
return false;
|
||||
}
|
||||
mptcp_subflow_joined(msk, ssk);
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
|
||||
/* attach to msk socket only after we are sure we will deal with it
|
||||
* at close time
|
||||
*/
|
||||
@@ -811,7 +818,6 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
|
||||
|
||||
mptcp_subflow_ctx(ssk)->subflow_id = msk->subflow_id++;
|
||||
mptcp_sockopt_sync_locked(msk, ssk);
|
||||
mptcp_subflow_joined(msk, ssk);
|
||||
mptcp_stop_tout_timer(sk);
|
||||
__mptcp_propagate_sndbuf(sk, ssk);
|
||||
return true;
|
||||
@@ -1136,10 +1142,14 @@ static void mptcp_update_infinite_map(struct mptcp_sock *msk,
|
||||
mpext->infinite_map = 1;
|
||||
mpext->data_len = 0;
|
||||
|
||||
if (!mptcp_try_fallback(ssk)) {
|
||||
mptcp_subflow_reset(ssk);
|
||||
return;
|
||||
}
|
||||
|
||||
MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_INFINITEMAPTX);
|
||||
mptcp_subflow_ctx(ssk)->send_infinite_map = 0;
|
||||
pr_fallback(msk);
|
||||
mptcp_do_fallback(ssk);
|
||||
}
|
||||
|
||||
#define MPTCP_MAX_GSO_SIZE (GSO_LEGACY_MAX_SIZE - (MAX_TCP_HEADER + 1))
|
||||
@@ -2543,9 +2553,9 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
|
||||
|
||||
static void __mptcp_retrans(struct sock *sk)
|
||||
{
|
||||
struct mptcp_sendmsg_info info = { .data_lock_held = true, };
|
||||
struct mptcp_sock *msk = mptcp_sk(sk);
|
||||
struct mptcp_subflow_context *subflow;
|
||||
struct mptcp_sendmsg_info info = {};
|
||||
struct mptcp_data_frag *dfrag;
|
||||
struct sock *ssk;
|
||||
int ret, err;
|
||||
@@ -2590,6 +2600,18 @@ static void __mptcp_retrans(struct sock *sk)
|
||||
info.sent = 0;
|
||||
info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
|
||||
dfrag->already_sent;
|
||||
|
||||
/*
|
||||
* make the whole retrans decision, xmit, disallow
|
||||
* fallback atomic
|
||||
*/
|
||||
spin_lock_bh(&msk->fallback_lock);
|
||||
if (__mptcp_check_fallback(msk)) {
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
release_sock(ssk);
|
||||
return;
|
||||
}
|
||||
|
||||
while (info.sent < info.limit) {
|
||||
ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
|
||||
if (ret <= 0)
|
||||
@@ -2603,8 +2625,9 @@ static void __mptcp_retrans(struct sock *sk)
|
||||
len = max(copied, len);
|
||||
tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
|
||||
info.size_goal);
|
||||
WRITE_ONCE(msk->allow_infinite_fallback, false);
|
||||
msk->allow_infinite_fallback = false;
|
||||
}
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
|
||||
release_sock(ssk);
|
||||
}
|
||||
@@ -2730,7 +2753,8 @@ static void __mptcp_init_sock(struct sock *sk)
|
||||
WRITE_ONCE(msk->first, NULL);
|
||||
inet_csk(sk)->icsk_sync_mss = mptcp_sync_mss;
|
||||
WRITE_ONCE(msk->csum_enabled, mptcp_is_checksum_enabled(sock_net(sk)));
|
||||
WRITE_ONCE(msk->allow_infinite_fallback, true);
|
||||
msk->allow_infinite_fallback = true;
|
||||
msk->allow_subflows = true;
|
||||
msk->recovery = false;
|
||||
msk->subflow_id = 1;
|
||||
msk->last_data_sent = tcp_jiffies32;
|
||||
@@ -2738,6 +2762,7 @@ static void __mptcp_init_sock(struct sock *sk)
|
||||
msk->last_ack_recv = tcp_jiffies32;
|
||||
|
||||
mptcp_pm_data_init(msk);
|
||||
spin_lock_init(&msk->fallback_lock);
|
||||
|
||||
/* re-use the csk retrans timer for MPTCP-level retrans */
|
||||
timer_setup(&msk->sk.icsk_retransmit_timer, mptcp_retransmit_timer, 0);
|
||||
@@ -3117,7 +3142,16 @@ static int mptcp_disconnect(struct sock *sk, int flags)
|
||||
* subflow
|
||||
*/
|
||||
mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
|
||||
|
||||
/* The first subflow is already in TCP_CLOSE status, the following
|
||||
* can't overlap with a fallback anymore
|
||||
*/
|
||||
spin_lock_bh(&msk->fallback_lock);
|
||||
msk->allow_subflows = true;
|
||||
msk->allow_infinite_fallback = true;
|
||||
WRITE_ONCE(msk->flags, 0);
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
|
||||
msk->cb_flags = 0;
|
||||
msk->recovery = false;
|
||||
WRITE_ONCE(msk->can_ack, false);
|
||||
@@ -3524,7 +3558,13 @@ bool mptcp_finish_join(struct sock *ssk)
|
||||
|
||||
/* active subflow, already present inside the conn_list */
|
||||
if (!list_empty(&subflow->node)) {
|
||||
spin_lock_bh(&msk->fallback_lock);
|
||||
if (!msk->allow_subflows) {
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
return false;
|
||||
}
|
||||
mptcp_subflow_joined(msk, ssk);
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
mptcp_propagate_sndbuf(parent, ssk);
|
||||
return true;
|
||||
}
|
||||
|
||||
+22
-7
@@ -346,10 +346,16 @@ struct mptcp_sock {
|
||||
u64 rtt_us; /* last maximum rtt of subflows */
|
||||
} rcvq_space;
|
||||
u8 scaling_ratio;
|
||||
bool allow_subflows;
|
||||
|
||||
u32 subflow_id;
|
||||
u32 setsockopt_seq;
|
||||
char ca_name[TCP_CA_NAME_MAX];
|
||||
|
||||
spinlock_t fallback_lock; /* protects fallback,
|
||||
* allow_infinite_fallback and
|
||||
* allow_join
|
||||
*/
|
||||
};
|
||||
|
||||
#define mptcp_data_lock(sk) spin_lock_bh(&(sk)->sk_lock.slock)
|
||||
@@ -1216,15 +1222,22 @@ static inline bool mptcp_check_fallback(const struct sock *sk)
|
||||
return __mptcp_check_fallback(msk);
|
||||
}
|
||||
|
||||
static inline void __mptcp_do_fallback(struct mptcp_sock *msk)
|
||||
static inline bool __mptcp_try_fallback(struct mptcp_sock *msk)
|
||||
{
|
||||
if (__mptcp_check_fallback(msk)) {
|
||||
pr_debug("TCP fallback already done (msk=%p)\n", msk);
|
||||
return;
|
||||
return true;
|
||||
}
|
||||
if (WARN_ON_ONCE(!READ_ONCE(msk->allow_infinite_fallback)))
|
||||
return;
|
||||
spin_lock_bh(&msk->fallback_lock);
|
||||
if (!msk->allow_infinite_fallback) {
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
return false;
|
||||
}
|
||||
|
||||
msk->allow_subflows = false;
|
||||
set_bit(MPTCP_FALLBACK_DONE, &msk->flags);
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
return true;
|
||||
}
|
||||
|
||||
static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk)
|
||||
@@ -1236,14 +1249,15 @@ static inline bool __mptcp_has_initial_subflow(const struct mptcp_sock *msk)
|
||||
TCPF_SYN_RECV | TCPF_LISTEN));
|
||||
}
|
||||
|
||||
static inline void mptcp_do_fallback(struct sock *ssk)
|
||||
static inline bool mptcp_try_fallback(struct sock *ssk)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
|
||||
struct sock *sk = subflow->conn;
|
||||
struct mptcp_sock *msk;
|
||||
|
||||
msk = mptcp_sk(sk);
|
||||
__mptcp_do_fallback(msk);
|
||||
if (!__mptcp_try_fallback(msk))
|
||||
return false;
|
||||
if (READ_ONCE(msk->snd_data_fin_enable) && !(ssk->sk_shutdown & SEND_SHUTDOWN)) {
|
||||
gfp_t saved_allocation = ssk->sk_allocation;
|
||||
|
||||
@@ -1255,6 +1269,7 @@ static inline void mptcp_do_fallback(struct sock *ssk)
|
||||
tcp_shutdown(ssk, SEND_SHUTDOWN);
|
||||
ssk->sk_allocation = saved_allocation;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
#define pr_fallback(a) pr_debug("%s:fallback to TCP (msk=%p)\n", __func__, a)
|
||||
@@ -1264,7 +1279,7 @@ static inline void mptcp_subflow_early_fallback(struct mptcp_sock *msk,
|
||||
{
|
||||
pr_fallback(msk);
|
||||
subflow->request_mptcp = 0;
|
||||
__mptcp_do_fallback(msk);
|
||||
WARN_ON_ONCE(!__mptcp_try_fallback(msk));
|
||||
}
|
||||
|
||||
static inline bool mptcp_check_infinite_map(struct sk_buff *skb)
|
||||
|
||||
+19
-11
@@ -544,9 +544,11 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
|
||||
mptcp_get_options(skb, &mp_opt);
|
||||
if (subflow->request_mptcp) {
|
||||
if (!(mp_opt.suboptions & OPTION_MPTCP_MPC_SYNACK)) {
|
||||
if (!mptcp_try_fallback(sk))
|
||||
goto do_reset;
|
||||
|
||||
MPTCP_INC_STATS(sock_net(sk),
|
||||
MPTCP_MIB_MPCAPABLEACTIVEFALLBACK);
|
||||
mptcp_do_fallback(sk);
|
||||
pr_fallback(msk);
|
||||
goto fallback;
|
||||
}
|
||||
@@ -1300,20 +1302,29 @@ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ss
|
||||
mptcp_schedule_work(sk);
|
||||
}
|
||||
|
||||
static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
|
||||
static bool mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
|
||||
{
|
||||
struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
|
||||
unsigned long fail_tout;
|
||||
|
||||
/* we are really failing, prevent any later subflow join */
|
||||
spin_lock_bh(&msk->fallback_lock);
|
||||
if (!msk->allow_infinite_fallback) {
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
return false;
|
||||
}
|
||||
msk->allow_subflows = false;
|
||||
spin_unlock_bh(&msk->fallback_lock);
|
||||
|
||||
/* graceful failure can happen only on the MPC subflow */
|
||||
if (WARN_ON_ONCE(ssk != READ_ONCE(msk->first)))
|
||||
return;
|
||||
return false;
|
||||
|
||||
/* since the close timeout take precedence on the fail one,
|
||||
* no need to start the latter when the first is already set
|
||||
*/
|
||||
if (sock_flag((struct sock *)msk, SOCK_DEAD))
|
||||
return;
|
||||
return true;
|
||||
|
||||
/* we don't need extreme accuracy here, use a zero fail_tout as special
|
||||
* value meaning no fail timeout at all;
|
||||
@@ -1325,6 +1336,7 @@ static void mptcp_subflow_fail(struct mptcp_sock *msk, struct sock *ssk)
|
||||
tcp_send_ack(ssk);
|
||||
|
||||
mptcp_reset_tout_timer(msk, subflow->fail_tout);
|
||||
return true;
|
||||
}
|
||||
|
||||
static bool subflow_check_data_avail(struct sock *ssk)
|
||||
@@ -1385,17 +1397,16 @@ fallback:
|
||||
(subflow->mp_join || subflow->valid_csum_seen)) {
|
||||
subflow->send_mp_fail = 1;
|
||||
|
||||
if (!READ_ONCE(msk->allow_infinite_fallback)) {
|
||||
if (!mptcp_subflow_fail(msk, ssk)) {
|
||||
subflow->reset_transient = 0;
|
||||
subflow->reset_reason = MPTCP_RST_EMIDDLEBOX;
|
||||
goto reset;
|
||||
}
|
||||
mptcp_subflow_fail(msk, ssk);
|
||||
WRITE_ONCE(subflow->data_avail, true);
|
||||
return true;
|
||||
}
|
||||
|
||||
if (!READ_ONCE(msk->allow_infinite_fallback)) {
|
||||
if (!mptcp_try_fallback(ssk)) {
|
||||
/* fatal protocol error, close the socket.
|
||||
* subflow_error_report() will introduce the appropriate barriers
|
||||
*/
|
||||
@@ -1413,8 +1424,6 @@ reset:
|
||||
WRITE_ONCE(subflow->data_avail, false);
|
||||
return false;
|
||||
}
|
||||
|
||||
mptcp_do_fallback(ssk);
|
||||
}
|
||||
|
||||
skb = skb_peek(&ssk->sk_receive_queue);
|
||||
@@ -1679,7 +1688,6 @@ int __mptcp_subflow_connect(struct sock *sk, const struct mptcp_pm_local *local,
|
||||
/* discard the subflow socket */
|
||||
mptcp_sock_graft(ssk, sk->sk_socket);
|
||||
iput(SOCK_INODE(sf));
|
||||
WRITE_ONCE(msk->allow_infinite_fallback, false);
|
||||
mptcp_stop_tout_timer(sk);
|
||||
return 0;
|
||||
|
||||
@@ -1851,7 +1859,7 @@ static void subflow_state_change(struct sock *sk)
|
||||
|
||||
msk = mptcp_sk(parent);
|
||||
if (subflow_simultaneous_connect(sk)) {
|
||||
mptcp_do_fallback(sk);
|
||||
WARN_ON_ONCE(!mptcp_try_fallback(sk));
|
||||
pr_fallback(msk);
|
||||
subflow->conn_finished = 1;
|
||||
mptcp_propagate_state(parent, sk, subflow, NULL);
|
||||
|
||||
Reference in New Issue
Block a user