
Commit 65c5287

edumazet authored and kuba-moo committed
tcp: fix sk_rcvbuf overshoot
Current autosizing in tcp_rcv_space_adjust() is too aggressive.

Instead of betting on possible losses and overestimating the BDP, it is
better to only account for slow start.

A following patch then adds more precise tuning in the event of packet
losses.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/20250513193919.1089692-3-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent c1269d3 commit 65c5287
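
To make the overshoot concrete, here is a minimal userspace sketch (not
kernel code) that replays the removed window arithmetic next to the new
one. The byte counts and advmss value are illustrative assumptions,
modeled on a sender that doubled its rate in slow start over the last RTT.

/* Userspace sketch (not kernel code): contrasts the removed estimate
 * with the new one. The byte counts and advmss are illustrative
 * assumptions.
 */
#include <stdint.h>
#include <stdio.h>

/* Old heuristic: 2x copied plus a cushion, plus a doubled growth term. */
static uint64_t rcvwin_old(uint64_t copied, uint64_t prev_space,
			   uint64_t advmss)
{
	uint64_t rcvwin = (copied << 1) + 16 * advmss;
	uint64_t grow = rcvwin * (copied - prev_space) / prev_space;

	return rcvwin + (grow << 1);
}

/* New heuristic: only allow the sender to double its rate. */
static uint64_t rcvwin_new(uint64_t copied)
{
	return copied << 1;
}

int main(void)
{
	uint64_t prev_space = 1000000;	/* bytes copied in the prior measurement */
	uint64_t copied = 2000000;	/* bytes copied in the last RTT */
	uint64_t advmss = 1460;

	/* Prints 12070080 vs 4000000: a roughly 3x smaller request. */
	printf("old rcvwin: %llu\n",
	       (unsigned long long)rcvwin_old(copied, prev_space, advmss));
	printf("new rcvwin: %llu\n",
	       (unsigned long long)rcvwin_new(copied));
	return 0;
}

In the kernel, tcp_space_from_win() converts the chosen window into a
buffer size, which tcp_rcvbuf_grow() in the diff below caps at
tcp_rmem[2].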

File tree

1 file changed (+25, -34)

net/ipv4/tcp_input.c

Lines changed: 25 additions & 34 deletions
@@ -747,6 +747,29 @@ static inline void tcp_rcv_rtt_measure_ts(struct sock *sk,
 	}
 }
 
+static void tcp_rcvbuf_grow(struct sock *sk)
+{
+	const struct net *net = sock_net(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	int rcvwin, rcvbuf, cap;
+
+	if (!READ_ONCE(net->ipv4.sysctl_tcp_moderate_rcvbuf) ||
+	    (sk->sk_userlocks & SOCK_RCVBUF_LOCK))
+		return;
+
+	/* slow start: allow the sender to double its rate. */
+	rcvwin = tp->rcvq_space.space << 1;
+
+	cap = READ_ONCE(net->ipv4.sysctl_tcp_rmem[2]);
+
+	rcvbuf = min_t(u32, tcp_space_from_win(sk, rcvwin), cap);
+	if (rcvbuf > sk->sk_rcvbuf) {
+		WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
+		/* Make the window clamp follow along. */
+		WRITE_ONCE(tp->window_clamp,
+			   tcp_win_from_space(sk, rcvbuf));
+	}
+}
 /*
  * This function should be called every time data is copied to user space.
  * It calculates the appropriate TCP receive buffer space.
@@ -771,42 +794,10 @@ void tcp_rcv_space_adjust(struct sock *sk)
 
 	trace_tcp_rcvbuf_grow(sk, time);
 
-	/* A bit of theory :
-	 * copied = bytes received in previous RTT, our base window
-	 * To cope with packet losses, we need a 2x factor
-	 * To cope with slow start, and sender growing its cwin by 100 %
-	 * every RTT, we need a 4x factor, because the ACK we are sending
-	 * now is for the next RTT, not the current one :
-	 * <prev RTT . ><current RTT .. ><next RTT .... >
-	 */
-
-	if (READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_moderate_rcvbuf) &&
-	    !(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
-		u64 rcvwin, grow;
-		int rcvbuf;
-
-		/* minimal window to cope with packet losses, assuming
-		 * steady state. Add some cushion because of small variations.
-		 */
-		rcvwin = ((u64)copied << 1) + 16 * tp->advmss;
-
-		/* Accommodate for sender rate increase (eg. slow start) */
-		grow = rcvwin * (copied - tp->rcvq_space.space);
-		do_div(grow, tp->rcvq_space.space);
-		rcvwin += (grow << 1);
-
-		rcvbuf = min_t(u64, tcp_space_from_win(sk, rcvwin),
-			       READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[2]));
-		if (rcvbuf > sk->sk_rcvbuf) {
-			WRITE_ONCE(sk->sk_rcvbuf, rcvbuf);
-
-			/* Make the window clamp follow along. */
-			WRITE_ONCE(tp->window_clamp,
-				   tcp_win_from_space(sk, rcvbuf));
-		}
-	}
 	tp->rcvq_space.space = copied;
 
+	tcp_rcvbuf_grow(sk);
+
 new_measure:
 	tp->rcvq_space.seq = tp->copied_seq;
 	tp->rcvq_space.time = tp->tcp_mstamp;
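
For completeness, the (sk->sk_userlocks & SOCK_RCVBUF_LOCK) early return
means this autotuning is skipped for sockets whose receive buffer was
pinned from userspace: setting SO_RCVBUF via setsockopt() sets
SOCK_RCVBUF_LOCK on the socket. A minimal sketch of such pinning, with
an arbitrary 1 MB size:

/* Userspace sketch: setsockopt(SO_RCVBUF) sets SOCK_RCVBUF_LOCK, so
 * tcp_rcvbuf_grow() returns early and receive-buffer autotuning is
 * disabled for this socket. The 1 MB size is an arbitrary example;
 * the kernel stores double the requested value.
 */
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	int sz = 1 << 20;	/* request a fixed 1 MB receive buffer */

	if (fd < 0) {
		perror("socket");
		return 1;
	}
	if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &sz, sizeof(sz)) < 0) {
		perror("setsockopt");
		return 1;
	}
	printf("receive buffer pinned; autotuning off for fd %d\n", fd);
	return 0;
}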
