
Commit 8005184

Paolo Abeni authored and Jakub Kicinski (kuba-moo) committed
mptcp: refactor sndbuf auto-tuning
The MPTCP protocol accounts for the data enqueued on all the subflows to the main socket send buffer, while the send-buffer auto-tuning algorithm sets the main socket send buffer size to the maximum size among the subflows. That causes poor performance when at least one subflow is sndbuf-limited, e.g. due to very high latency, as the MPTCP scheduler can't even fill such a buffer.

Change the send-buffer auto-tuning algorithm to compute the main socket send buffer size as the sum of all the subflows' buffer sizes.

Reviewed-by: Mat Martineau <martineau@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Signed-off-by: Mat Martineau <martineau@kernel.org>
Link: https://lore.kernel.org/r/20231023-send-net-next-20231023-2-v1-9-9dc60939d371@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 9fdc779 commit 8005184
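To see why the sum-based rule helps, consider a connection with one fast subflow and one high-latency, sndbuf-limited subflow. The toy userspace program below (plain C, not kernel code; the subflow buffer sizes and the tcp_wmem[0] stand-in are made-up numbers) contrasts the old max-based computation with the new sum-based one: under the old rule the msk send buffer is capped at the largest subflow, leaving the scheduler no room to keep both paths busy.

#include <stdio.h>

/* Illustrative only: contrast the old (max) and new (sum) msk sndbuf
 * auto-tuning policies for a set of hypothetical subflow sndbuf sizes.
 */
int main(void)
{
        int subflow_sndbuf[] = { 4 * 1024 * 1024, 256 * 1024 }; /* fast path, high-latency path */
        int wmem0 = 4096;   /* stand-in for net.ipv4.tcp_wmem[0] */
        int n = 2, i, max = 0, sum = wmem0;

        for (i = 0; i < n; i++) {
                if (subflow_sndbuf[i] > max)
                        max = subflow_sndbuf[i];
                sum += subflow_sndbuf[i];
        }

        printf("old policy (max): msk sndbuf = %d bytes\n", max);
        printf("new policy (sum): msk sndbuf = %d bytes\n", sum);
        return 0;
}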

File tree

4 files changed (+70, −10 lines)


net/mptcp/protocol.c

Lines changed: 16 additions & 2 deletions
@@ -890,6 +890,7 @@ static bool __mptcp_finish_join(struct mptcp_sock *msk, struct sock *ssk)
 	mptcp_sockopt_sync_locked(msk, ssk);
 	mptcp_subflow_joined(msk, ssk);
 	mptcp_stop_tout_timer(sk);
+	__mptcp_propagate_sndbuf(sk, ssk);
 	return true;
 }
 
@@ -1076,15 +1077,16 @@ static void mptcp_enter_memory_pressure(struct sock *sk)
 	struct mptcp_sock *msk = mptcp_sk(sk);
 	bool first = true;
 
-	sk_stream_moderate_sndbuf(sk);
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
 		if (first)
 			tcp_enter_memory_pressure(ssk);
 		sk_stream_moderate_sndbuf(ssk);
+
 		first = false;
 	}
+	__mptcp_sync_sndbuf(sk);
 }
 
 /* ensure we get enough memory for the frag hdr, beyond some minimal amount of
@@ -2458,6 +2460,7 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		WRITE_ONCE(msk->first, NULL);
 
 out:
+	__mptcp_sync_sndbuf(sk);
 	if (need_push)
 		__mptcp_push_pending(sk, 0);
 
@@ -3224,7 +3227,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
 	 * uses the correct data
 	 */
 	mptcp_copy_inaddrs(nsk, ssk);
-	mptcp_propagate_sndbuf(nsk, ssk);
+	__mptcp_propagate_sndbuf(nsk, ssk);
 
 	mptcp_rcv_space_init(msk, ssk);
 	bh_unlock_sock(nsk);
@@ -3402,6 +3405,8 @@ static void mptcp_release_cb(struct sock *sk)
 			__mptcp_set_connected(sk);
 		if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
 			__mptcp_error_report(sk);
+		if (__test_and_clear_bit(MPTCP_SYNC_SNDBUF, &msk->cb_flags))
+			__mptcp_sync_sndbuf(sk);
 	}
 
 	__mptcp_update_rmem(sk);
@@ -3446,6 +3451,14 @@ void mptcp_subflow_process_delegated(struct sock *ssk, long status)
 			__set_bit(MPTCP_PUSH_PENDING, &mptcp_sk(sk)->cb_flags);
 		mptcp_data_unlock(sk);
 	}
+	if (status & BIT(MPTCP_DELEGATE_SNDBUF)) {
+		mptcp_data_lock(sk);
+		if (!sock_owned_by_user(sk))
+			__mptcp_sync_sndbuf(sk);
+		else
+			__set_bit(MPTCP_SYNC_SNDBUF, &mptcp_sk(sk)->cb_flags);
+		mptcp_data_unlock(sk);
+	}
 	if (status & BIT(MPTCP_DELEGATE_ACK))
 		schedule_3rdack_retransmission(ssk);
 }
@@ -3530,6 +3543,7 @@ bool mptcp_finish_join(struct sock *ssk)
 	/* active subflow, already present inside the conn_list */
 	if (!list_empty(&subflow->node)) {
 		mptcp_subflow_joined(msk, ssk);
+		mptcp_propagate_sndbuf(parent, ssk);
 		return true;
 	}
 
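The mptcp_subflow_process_delegated() and mptcp_release_cb() hunks above follow the usual MPTCP pattern of deferring work that needs the msk socket lock: act immediately when the owner is not holding the lock, otherwise record a flag in cb_flags and replay it when the owner releases the socket. A minimal standalone sketch of that pattern (toy types and names, not the kernel API):

#include <stdbool.h>
#include <stdio.h>

/* Toy model (not kernel code) of the defer-to-release_cb pattern: if the
 * "socket" is currently owned by user context, record the pending work in a
 * flags word; the owner replays it when releasing the lock.
 */
#define SYNC_SNDBUF_BIT 0x1

struct toy_sock {
        bool owned_by_user;
        unsigned long cb_flags;
};

static void sync_sndbuf(struct toy_sock *sk)
{
        (void)sk;
        printf("sndbuf synced\n");
}

static void process_delegated(struct toy_sock *sk)
{
        if (!sk->owned_by_user)
                sync_sndbuf(sk);                 /* safe to act immediately */
        else
                sk->cb_flags |= SYNC_SNDBUF_BIT; /* defer to release time */
}

static void release_cb(struct toy_sock *sk)
{
        if (sk->cb_flags & SYNC_SNDBUF_BIT) {
                sk->cb_flags &= ~SYNC_SNDBUF_BIT;
                sync_sndbuf(sk);
        }
}

int main(void)
{
        struct toy_sock sk = { .owned_by_user = true };

        process_delegated(&sk); /* owner busy: work is queued */
        sk.owned_by_user = false;
        release_cb(&sk);        /* owner done: deferred sync runs */
        return 0;
}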
net/mptcp/protocol.h

Lines changed: 49 additions & 5 deletions
@@ -125,6 +125,7 @@
 #define MPTCP_RETRANSMIT	4
 #define MPTCP_FLUSH_JOIN_LIST	5
 #define MPTCP_CONNECTED		6
+#define MPTCP_SYNC_SNDBUF	7
 
 struct mptcp_skb_cb {
 	u64 map_seq;
@@ -445,6 +446,7 @@ DECLARE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
 #define MPTCP_DELEGATE_SCHEDULED	0
 #define MPTCP_DELEGATE_SEND		1
 #define MPTCP_DELEGATE_ACK		2
+#define MPTCP_DELEGATE_SNDBUF		3
 
 #define MPTCP_DELEGATE_ACTIONS_MASK	(~BIT(MPTCP_DELEGATE_SCHEDULED))
 /* MPTCP subflow context */
@@ -518,6 +520,9 @@ struct mptcp_subflow_context {
 
 	u32	setsockopt_seq;
 	u32	stale_rcv_tstamp;
+	int	cached_sndbuf;	    /* sndbuf size when last synced with the msk sndbuf,
+				     * protected by the msk socket lock
+				     */
 
 	struct	sock *tcp_sock;	    /* tcp sk backpointer */
 	struct	sock *conn;	    /* parent mptcp_sock */
@@ -780,13 +785,52 @@ static inline bool mptcp_data_fin_enabled(const struct mptcp_sock *msk)
 	       READ_ONCE(msk->write_seq) == READ_ONCE(msk->snd_nxt);
 }
 
-static inline bool mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+static inline void __mptcp_sync_sndbuf(struct sock *sk)
 {
-	if ((sk->sk_userlocks & SOCK_SNDBUF_LOCK) || ssk->sk_sndbuf <= READ_ONCE(sk->sk_sndbuf))
-		return false;
+	struct mptcp_subflow_context *subflow;
+	int ssk_sndbuf, new_sndbuf;
+
+	if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+		return;
+
+	new_sndbuf = sock_net(sk)->ipv4.sysctl_tcp_wmem[0];
+	mptcp_for_each_subflow(mptcp_sk(sk), subflow) {
+		ssk_sndbuf = READ_ONCE(mptcp_subflow_tcp_sock(subflow)->sk_sndbuf);
+
+		subflow->cached_sndbuf = ssk_sndbuf;
+		new_sndbuf += ssk_sndbuf;
+	}
+
+	/* the msk max wmem limit is <nr_subflows> * tcp wmem[2] */
+	WRITE_ONCE(sk->sk_sndbuf, new_sndbuf);
+}
+
+/* The called held both the msk socket and the subflow socket locks,
+ * possibly under BH
+ */
+static inline void __mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+	if (READ_ONCE(ssk->sk_sndbuf) != subflow->cached_sndbuf)
+		__mptcp_sync_sndbuf(sk);
+}
+
+/* the caller held only the subflow socket lock, either in process or
+ * BH context. Additionally this can be called under the msk data lock,
+ * so we can't acquire such lock here: let the delegate action acquires
+ * the needed locks in suitable order.
+ */
+static inline void mptcp_propagate_sndbuf(struct sock *sk, struct sock *ssk)
+{
+	struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+
+	if (likely(READ_ONCE(ssk->sk_sndbuf) == subflow->cached_sndbuf))
+		return;
 
-	WRITE_ONCE(sk->sk_sndbuf, ssk->sk_sndbuf);
-	return true;
+	local_bh_disable();
+	mptcp_subflow_delegate(subflow, MPTCP_DELEGATE_SNDBUF);
+	local_bh_enable();
 }
 
 static inline void mptcp_write_space(struct sock *sk)
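The per-subflow cached_sndbuf field added above lets both propagate helpers skip the full subflow walk when nothing changed: the value recorded at the last sync is compared against the current sk_sndbuf, and only a mismatch triggers a re-sum. A small standalone sketch of that change-detection guard (toy structures and made-up sizes, not the kernel code):

#include <stdio.h>

/* Toy model (not kernel code) of the cached_sndbuf guard: the full re-sum
 * only runs when a subflow's sndbuf differs from the value recorded at the
 * last msk-level sync.
 */
struct toy_subflow {
        int sndbuf;        /* current per-subflow send buffer */
        int cached_sndbuf; /* value seen at the last sync */
};

static int sync_all(struct toy_subflow *sf, int n, int wmem_min)
{
        int i, total = wmem_min;

        for (i = 0; i < n; i++) {
                sf[i].cached_sndbuf = sf[i].sndbuf;
                total += sf[i].sndbuf;
        }
        return total;
}

static void propagate(struct toy_subflow *changed, struct toy_subflow *all,
                      int n, int wmem_min, int *msk_sndbuf)
{
        /* cheap check first: nothing to do if this subflow did not grow or shrink */
        if (changed->sndbuf == changed->cached_sndbuf)
                return;
        *msk_sndbuf = sync_all(all, n, wmem_min);
}

int main(void)
{
        struct toy_subflow sf[2] = { { 65536, 0 }, { 131072, 0 } };
        int msk_sndbuf = sync_all(sf, 2, 4096);

        printf("after initial sync: %d\n", msk_sndbuf);

        propagate(&sf[0], sf, 2, 4096, &msk_sndbuf); /* unchanged: no re-sum */
        sf[1].sndbuf = 262144;                       /* autotuning grew subflow 1 */
        propagate(&sf[1], sf, 2, 4096, &msk_sndbuf);
        printf("after growth: %d\n", msk_sndbuf);
        return 0;
}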

net/mptcp/sockopt.c

Lines changed: 4 additions & 1 deletion
@@ -95,6 +95,7 @@ static void mptcp_sol_socket_sync_intval(struct mptcp_sock *msk, int optname, in
 	case SO_SNDBUFFORCE:
 		ssk->sk_userlocks |= SOCK_SNDBUF_LOCK;
 		WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
+		mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
 		break;
 	case SO_RCVBUF:
 	case SO_RCVBUFFORCE:
@@ -1415,8 +1416,10 @@ static void sync_socket_options(struct mptcp_sock *msk, struct sock *ssk)
 
 	if (sk->sk_userlocks & tx_rx_locks) {
 		ssk->sk_userlocks |= sk->sk_userlocks & tx_rx_locks;
-		if (sk->sk_userlocks & SOCK_SNDBUF_LOCK)
+		if (sk->sk_userlocks & SOCK_SNDBUF_LOCK) {
 			WRITE_ONCE(ssk->sk_sndbuf, sk->sk_sndbuf);
+			mptcp_subflow_ctx(ssk)->cached_sndbuf = sk->sk_sndbuf;
+		}
 		if (sk->sk_userlocks & SOCK_RCVBUF_LOCK)
 			WRITE_ONCE(ssk->sk_rcvbuf, sk->sk_rcvbuf);
 	}
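These sockopt hunks keep cached_sndbuf coherent when user space pins the send buffer explicitly: once SO_SNDBUF (or SO_SNDBUFFORCE) sets SOCK_SNDBUF_LOCK, __mptcp_sync_sndbuf() bails out early and the sum-based auto-tuning stops. A minimal userspace example of pinning the buffer on an MPTCP socket (assumes a Linux kernel built with MPTCP support; error handling kept short):

#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

#ifndef IPPROTO_MPTCP
#define IPPROTO_MPTCP 262
#endif

/* Userspace illustration: pinning SO_SNDBUF on the MPTCP socket sets
 * SOCK_SNDBUF_LOCK, which makes the kernel skip the auto-tuning shown above.
 */
int main(void)
{
        int sndbuf = 1 << 20; /* 1 MiB, arbitrary example value */
        int fd = socket(AF_INET, SOCK_STREAM, IPPROTO_MPTCP);

        if (fd < 0) {
                perror("socket(IPPROTO_MPTCP)");
                return 1;
        }
        if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf, sizeof(sndbuf)) < 0)
                perror("setsockopt(SO_SNDBUF)");

        close(fd);
        return 0;
}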

net/mptcp/subflow.c

Lines changed: 1 addition & 2 deletions
@@ -421,6 +421,7 @@ static bool subflow_use_different_dport(struct mptcp_sock *msk, const struct soc
 
 void __mptcp_set_connected(struct sock *sk)
 {
+	__mptcp_propagate_sndbuf(sk, mptcp_sk(sk)->first);
 	if (sk->sk_state == TCP_SYN_SENT) {
 		inet_sk_state_store(sk, TCP_ESTABLISHED);
 		sk->sk_state_change(sk);
@@ -472,7 +473,6 @@ static void subflow_finish_connect(struct sock *sk, const struct sk_buff *skb)
 		return;
 
 	msk = mptcp_sk(parent);
-	mptcp_propagate_sndbuf(parent, sk);
 	subflow->rel_write_seq = 1;
 	subflow->conn_finished = 1;
 	subflow->ssn_offset = TCP_SKB_CB(skb)->seq;
@@ -1736,7 +1736,6 @@ static void subflow_state_change(struct sock *sk)
 
 	msk = mptcp_sk(parent);
 	if (subflow_simultaneous_connect(sk)) {
-		mptcp_propagate_sndbuf(parent, sk);
 		mptcp_do_fallback(sk);
 		mptcp_rcv_space_init(msk, sk);
 		pr_fallback(msk);
