
Commit e118cdc

Paolo Abeni authored and kuba-moo committed
mptcp: rcvbuf auto-tuning improvement
Apply to the MPTCP auto-tuning the same improvements introduced for the TCP protocol by the merge commit 2da35e4 ("Merge branch 'tcp-receive-side-improvements'").

The main difference is that the TCP subflows and the main MPTCP socket need to account for out-of-order (OoO) data separately: MPTCP does not care about TCP-level OoO and vice versa. As a consequence, do not reflect MPTCP-level rcvbuf increases due to OoO packets at the subflow level.

This refactor additionally allows dropping the msk receive buffer update at receive time, as the latter was only intended to cope with subflow receive buffer increases due to OoO packets.

Closes: multipath-tcp/mptcp_net-next#487
Closes: multipath-tcp/mptcp_net-next#559
Reviewed-by: Geliang Tang <geliang@kernel.org>
Tested-by: Geliang Tang <geliang@kernel.org>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Signed-off-by: Matthieu Baerts (NGI0) <matttbe@kernel.org>
Link: https://patch.msgid.link/20250927-net-next-mptcp-rcv-path-imp-v1-3-5da266aa9c1a@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent a755677 commit e118cdc
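
For readers skimming the diff below: the new msk-level heuristic boils down to "double the bytes copied in the last measurement window, add the span of any MPTCP-level out-of-order queue, cap with tcp_rmem[2]". The standalone sketch below mirrors that arithmetic in userspace; the 1.5x overhead factor standing in for mptcp_space_from_win() and all sample values are assumptions for illustration, not kernel code.

#include <stdbool.h>
#include <stdio.h>

/* Assumed stand-in for mptcp_space_from_win(): scale the window up to
 * account for skb overhead. The real kernel helper derives the factor
 * from the socket's buffer/window ratio; 1.5x is a placeholder.
 */
static int space_from_win(int win)
{
        return win + (win >> 1);
}

/* Mirrors the shape of the new mptcp_rcvbuf_grow():
 * - start from twice rcvq_space.space
 * - add the out-of-order span (0 when the OoO queue is empty)
 * - convert window to buffer space, cap with tcp_rmem[2]
 * - grow sk_rcvbuf only when the result is larger
 */
static bool rcvbuf_grow(int *sk_rcvbuf, int space, int ooo_span, int rmem_max)
{
        int rcvwin = (space << 1) + ooo_span;
        int rcvbuf = space_from_win(rcvwin);

        if (rcvbuf > rmem_max)
                rcvbuf = rmem_max;
        if (rcvbuf > *sk_rcvbuf) {
                *sk_rcvbuf = rcvbuf;
                return true;
        }
        return false;
}

int main(void)
{
        int sk_rcvbuf = 131072; /* current receive buffer: 128 KiB */

        /* 256 KiB copied last window, 64 KiB out-of-order, 6 MiB cap */
        if (rcvbuf_grow(&sk_rcvbuf, 262144, 65536, 6291456))
                printf("rcvbuf grown to %d bytes\n", sk_rcvbuf);
        return 0;
}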

File tree

2 files changed: +49, -52 lines

net/mptcp/protocol.c

Lines changed: 47 additions & 50 deletions
@@ -179,6 +179,35 @@ static bool mptcp_ooo_try_coalesce(struct mptcp_sock *msk, struct sk_buff *to,
         return mptcp_try_coalesce((struct sock *)msk, to, from);
 }
 
+/* "inspired" by tcp_rcvbuf_grow(), main difference:
+ * - mptcp does not maintain a msk-level window clamp
+ * - returns true when the receive buffer is actually updated
+ */
+static bool mptcp_rcvbuf_grow(struct sock *sk)
+{
+        struct mptcp_sock *msk = mptcp_sk(sk);
+        const struct net *net = sock_net(sk);
+        int rcvwin, rcvbuf, cap;
+
+        if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+            (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+                return false;
+
+        rcvwin = msk->rcvq_space.space << 1;
+
+        if (!RB_EMPTY_ROOT(&msk->out_of_order_queue))
+                rcvwin += MPTCP_SKB_CB(msk->ooo_last_skb)->end_seq - msk->ack_seq;
+
+        cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+        rcvbuf = min_t(u32, mptcp_space_from_win(sk, rcvwin), cap);
+        if (rcvbuf > sk->sk_rcvbuf) {
+                WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+                return true;
+        }
+        return false;
+}
+
 /* "inspired" by tcp_data_queue_ofo(), main differences:
  * - use mptcp seqs
  * - don't cope with sacks
@@ -292,6 +321,9 @@ static void mptcp_data_queue_ofo(struct mptcp_sock *msk, struct sk_buff *skb)
 end:
         skb_condense(skb);
         skb_set_owner_r(skb, sk);
+        /* do not grow rcvbuf for not-yet-accepted or orphaned sockets. */
+        if (sk->sk_socket)
+                mptcp_rcvbuf_grow(sk);
 }
 
 static bool __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
@@ -784,18 +816,10 @@ static bool move_skbs_to_msk(struct mptcp_sock *msk, struct sock *ssk)
         return moved;
 }
 
-static void __mptcp_rcvbuf_update(struct sock *sk, struct sock *ssk)
-{
-        if (unlikely(ssk->sk_rcvbuf > sk->sk_rcvbuf))
-                WRITE_ONCE(sk->sk_rcvbuf, ssk->sk_rcvbuf);
-}
-
 static void __mptcp_data_ready(struct sock *sk, struct sock *ssk)
 {
         struct mptcp_sock *msk = mptcp_sk(sk);
 
-        __mptcp_rcvbuf_update(sk, ssk);
-
         /* Wake-up the reader only for in-sequence data */
         if (move_skbs_to_msk(msk, ssk) && mptcp_epollin_ready(sk))
                 sk->sk_data_ready(sk);
@@ -2014,48 +2038,26 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied)
         if (msk->rcvq_space.copied <= msk->rcvq_space.space)
                 goto new_measure;
 
-        if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-            !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-                u64 rcvwin, grow;
-                int rcvbuf;
-
-                rcvwin = ((u64)msk->rcvq_space.copied << 1) + 16 * advmss;
-
-                grow = rcvwin * (msk->rcvq_space.copied - msk->rcvq_space.space);
-
-                do_div(grow, msk->rcvq_space.space);
-                rcvwin += (grow << 1);
-
-                rcvbuf = min_t(u64, mptcp_space_from_win(sk, rcvwin),
-                               READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-
-                if (rcvbuf > sk->sk_rcvbuf) {
-                        u32 window_clamp;
-
-                        window_clamp = mptcp_win_from_space(sk, rcvbuf);
-                        WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+        msk->rcvq_space.space = msk->rcvq_space.copied;
+        if (mptcp_rcvbuf_grow(sk)) {
 
-                        /* Make subflows follow along. If we do not do this, we
-                         * get drops at subflow level if skbs can't be moved to
-                         * the mptcp rx queue fast enough (announced rcv_win can
-                         * exceed ssk->sk_rcvbuf).
-                         */
-                        mptcp_for_each_subflow(msk, subflow) {
-                                struct sock *ssk;
-                                bool slow;
+                /* Make subflows follow along. If we do not do this, we
+                 * get drops at subflow level if skbs can't be moved to
+                 * the mptcp rx queue fast enough (announced rcv_win can
+                 * exceed ssk->sk_rcvbuf).
+                 */
+                mptcp_for_each_subflow(msk, subflow) {
+                        struct sock *ssk;
+                        bool slow;
 
-                                ssk = mptcp_subflow_tcp_sock(subflow);
-                                slow = lock_sock_fast(ssk);
-                                WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf);
-                                WRITE_ONCE(tcp_sk(ssk)->window_clamp, window_clamp);
-                                if (tcp_can_send_ack(ssk))
-                                        tcp_cleanup_rbuf(ssk, 1);
-                                unlock_sock_fast(ssk, slow);
-                        }
+                        ssk = mptcp_subflow_tcp_sock(subflow);
+                        slow = lock_sock_fast(ssk);
+                        tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
+                        tcp_rcvbuf_grow(ssk);
+                        unlock_sock_fast(ssk, slow);
                 }
         }
 
-        msk->rcvq_space.space = msk->rcvq_space.copied;
 new_measure:
         msk->rcvq_space.copied = 0;
         msk->rcvq_space.time = mstamp;
@@ -2084,11 +2086,6 @@ static bool __mptcp_move_skbs(struct sock *sk)
         if (list_empty(&msk->conn_list))
                 return false;
 
-        /* verify we can move any data from the subflow, eventually updating */
-        if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK))
-                mptcp_for_each_subflow(msk, subflow)
-                        __mptcp_rcvbuf_update(sk, subflow->tcp_sock);
-
         subflow = list_first_entry(&msk->conn_list,
                                    struct mptcp_subflow_context, node);
         for (;;) {
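
Worth spelling out what the mptcp_rcv_space_adjust() hunk above changes: the slack-based growth formula (rcvwin = 2 * copied + 16 * advmss, plus a grow term scaled by the copied/space ratio) is dropped in favor of storing the copied bytes into rcvq_space.space and letting mptcp_rcvbuf_grow() double it, while each subflow is seeded with the same value and runs tcp_rcvbuf_grow() instead of having its rcvbuf and window_clamp forced from the msk. A throwaway userspace comparison of the two window computations, with arbitrary sample numbers:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Old path (removed above): slack-based growth from bytes copied. */
static uint64_t old_rcvwin(uint64_t copied, uint64_t space, uint64_t advmss)
{
        uint64_t rcvwin = (copied << 1) + 16 * advmss;
        uint64_t grow = rcvwin * (copied - space) / space;

        return rcvwin + (grow << 1);
}

/* New path: rcvq_space.space = copied, then mptcp_rcvbuf_grow()
 * doubles it and adds any out-of-order span.
 */
static uint64_t new_rcvwin(uint64_t copied, uint64_t ooo_span)
{
        return (copied << 1) + ooo_span;
}

int main(void)
{
        /* 256 KiB copied vs. 128 KiB in the previous window, MSS 1460 */
        printf("old rcvwin: %" PRIu64 "\n", old_rcvwin(262144, 131072, 1460));
        printf("new rcvwin: %" PRIu64 "\n", new_rcvwin(262144, 0));
        return 0;
}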

net/mptcp/protocol.h

Lines changed: 2 additions & 2 deletions
@@ -341,8 +341,8 @@ struct mptcp_sock {
         struct mptcp_pm_data pm;
         struct mptcp_sched_ops *sched;
         struct {
-                u32 space;      /* bytes copied in last measurement window */
-                u32 copied;     /* bytes copied in this measurement window */
+                int space;      /* bytes copied in last measurement window */
+                int copied;     /* bytes copied in this measurement window */
                 u64 time;       /* start time of measurement window */
                 u64 rtt_us;     /* last maximum rtt of subflows */
         } rcvq_space;
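
A note on this header hunk: the reworked mptcp_rcv_space_adjust() now copies msk->rcvq_space.copied straight into tcp_sk(ssk)->rcvq_space.space on every subflow, and mptcp_rcvbuf_grow() does signed arithmetic on these counters, so switching them from u32 to int keeps the two sides sign-consistent. A compile-only sketch of that relationship; both structs are heavily abridged stand-ins, and the _view names (plus the assumption that the TCP-side field is int) are mine, not the kernel's:

/* Abridged stand-ins keeping only the fields the "follow along"
 * assignment touches.
 */
struct tcp_sock_view {
        struct { int space; } rcvq_space;       /* assumed int, as on the msk side */
};

struct mptcp_sock_view {
        struct { int space; int copied; } rcvq_space;   /* int after this patch */
};

/* Same shape as the new subflow seeding in mptcp_rcv_space_adjust():
 *     tcp_sk(ssk)->rcvq_space.space = msk->rcvq_space.copied;
 */
static void seed_subflow(struct tcp_sock_view *tp,
                         const struct mptcp_sock_view *msk)
{
        tp->rcvq_space.space = msk->rcvq_space.copied;
}

int main(void)
{
        struct tcp_sock_view tp = { { 0 } };
        struct mptcp_sock_view msk = { { 0, 262144 } };

        seed_subflow(&tp, &msk);
        return tp.rcvq_space.space == 262144 ? 0 : 1;
}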
