Merge branch 'tcp-options-lockless'
Eric Dumazet says:

====================
tcp: set few options locklessly

This series avoids taking the socket lock for six TCP options.

They are not heavily used, but this exercise can give
ideas for other parts of the TCP/IP stack :)
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
davem330 committed Aug 6, 2023
2 parents 8108307 + 6e97ba5 commit 16fd753
Showing 5 changed files with 68 additions and 78 deletions.
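For context, the six options this series touches (TCP_SYNCNT, TCP_USER_TIMEOUT, TCP_KEEPINTVL, TCP_KEEPCNT, TCP_LINGER2 and TCP_DEFER_ACCEPT) are all plain integer setsockopt() values. A minimal user-space sketch of how they are set (standard sockets API; the values below are purely illustrative, not recommendations):

/* Illustrative only: sets four of the six options on an existing TCP socket.
 * Error handling is collapsed into a single check.
 */
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

static int tune_tcp_socket(int fd)
{
	int syncnt = 4;                /* TCP_SYNCNT: max SYN retransmits */
	unsigned int utimeout = 30000; /* TCP_USER_TIMEOUT: ms before aborting */
	int keepintvl = 10;            /* TCP_KEEPINTVL: seconds between keepalive probes */
	int keepcnt = 5;               /* TCP_KEEPCNT: unanswered probes before drop */

	if (setsockopt(fd, IPPROTO_TCP, TCP_SYNCNT, &syncnt, sizeof(syncnt)) ||
	    setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT, &utimeout, sizeof(utimeout)) ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPINTVL, &keepintvl, sizeof(keepintvl)) ||
	    setsockopt(fd, IPPROTO_TCP, TCP_KEEPCNT, &keepcnt, sizeof(keepcnt)))
		return -1;
	return 0;
}

After this series, each of those calls validates the value and publishes it with a single WRITE_ONCE() store instead of taking the socket lock.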
2 changes: 1 addition & 1 deletion include/linux/tcp.h
@@ -564,6 +564,6 @@ void __tcp_sock_set_nodelay(struct sock *sk, bool on);
void tcp_sock_set_nodelay(struct sock *sk);
void tcp_sock_set_quickack(struct sock *sk, int val);
int tcp_sock_set_syncnt(struct sock *sk, int val);
void tcp_sock_set_user_timeout(struct sock *sk, u32 val);
int tcp_sock_set_user_timeout(struct sock *sk, int val);

#endif /* _LINUX_TCP_H */
90 changes: 35 additions & 55 deletions net/ipv4/tcp.c
@@ -2865,7 +2865,7 @@ void __tcp_close(struct sock *sk, long timeout)

if (sk->sk_state == TCP_FIN_WAIT2) {
struct tcp_sock *tp = tcp_sk(sk);
if (tp->linger2 < 0) {
if (READ_ONCE(tp->linger2) < 0) {
tcp_set_state(sk, TCP_CLOSE);
tcp_send_active_reset(sk, GFP_ATOMIC);
__NET_INC_STATS(sock_net(sk),
@@ -3291,18 +3291,21 @@ int tcp_sock_set_syncnt(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_SYNCNT)
return -EINVAL;

lock_sock(sk);
WRITE_ONCE(inet_csk(sk)->icsk_syn_retries, val);
release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_syncnt);

void tcp_sock_set_user_timeout(struct sock *sk, u32 val)
int tcp_sock_set_user_timeout(struct sock *sk, int val)
{
lock_sock(sk);
/* Cap the max time in ms TCP will retry or probe the window
* before giving up and aborting (ETIMEDOUT) a connection.
*/
if (val < 0)
return -EINVAL;

WRITE_ONCE(inet_csk(sk)->icsk_user_timeout, val);
release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_user_timeout);

@@ -3345,9 +3348,7 @@ int tcp_sock_set_keepintvl(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_KEEPINTVL)
return -EINVAL;

lock_sock(sk);
WRITE_ONCE(tcp_sk(sk)->keepalive_intvl, val * HZ);
release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_keepintvl);
@@ -3357,10 +3358,8 @@ int tcp_sock_set_keepcnt(struct sock *sk, int val)
if (val < 1 || val > MAX_TCP_KEEPCNT)
return -EINVAL;

lock_sock(sk);
/* Paired with READ_ONCE() in keepalive_probes() */
WRITE_ONCE(tcp_sk(sk)->keepalive_probes, val);
release_sock(sk);
return 0;
}
EXPORT_SYMBOL(tcp_sock_set_keepcnt);
@@ -3462,6 +3461,32 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
if (copy_from_sockptr(&val, optval, sizeof(val)))
return -EFAULT;

/* Handle options that can be set without locking the socket. */
switch (optname) {
case TCP_SYNCNT:
return tcp_sock_set_syncnt(sk, val);
case TCP_USER_TIMEOUT:
return tcp_sock_set_user_timeout(sk, val);
case TCP_KEEPINTVL:
return tcp_sock_set_keepintvl(sk, val);
case TCP_KEEPCNT:
return tcp_sock_set_keepcnt(sk, val);
case TCP_LINGER2:
if (val < 0)
WRITE_ONCE(tp->linger2, -1);
else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
else
WRITE_ONCE(tp->linger2, val * HZ);
return 0;
case TCP_DEFER_ACCEPT:
/* Translate value in seconds to number of retransmits */
WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
TCP_RTO_MAX / HZ));
return 0;
}

sockopt_lock_sock(sk);

switch (optname) {
@@ -3557,25 +3582,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
case TCP_KEEPIDLE:
err = tcp_sock_set_keepidle_locked(sk, val);
break;
case TCP_KEEPINTVL:
if (val < 1 || val > MAX_TCP_KEEPINTVL)
err = -EINVAL;
else
WRITE_ONCE(tp->keepalive_intvl, val * HZ);
break;
case TCP_KEEPCNT:
if (val < 1 || val > MAX_TCP_KEEPCNT)
err = -EINVAL;
else
WRITE_ONCE(tp->keepalive_probes, val);
break;
case TCP_SYNCNT:
if (val < 1 || val > MAX_TCP_SYNCNT)
err = -EINVAL;
else
WRITE_ONCE(icsk->icsk_syn_retries, val);
break;

case TCP_SAVE_SYN:
/* 0: disable, 1: enable, 2: start from ether_header */
if (val < 0 || val > 2)
@@ -3584,22 +3590,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
tp->save_syn = val;
break;

case TCP_LINGER2:
if (val < 0)
WRITE_ONCE(tp->linger2, -1);
else if (val > TCP_FIN_TIMEOUT_MAX / HZ)
WRITE_ONCE(tp->linger2, TCP_FIN_TIMEOUT_MAX);
else
WRITE_ONCE(tp->linger2, val * HZ);
break;

case TCP_DEFER_ACCEPT:
/* Translate value in seconds to number of retransmits */
WRITE_ONCE(icsk->icsk_accept_queue.rskq_defer_accept,
secs_to_retrans(val, TCP_TIMEOUT_INIT / HZ,
TCP_RTO_MAX / HZ));
break;

case TCP_WINDOW_CLAMP:
err = tcp_set_window_clamp(sk, val);
break;
@@ -3614,16 +3604,6 @@ int do_tcp_setsockopt(struct sock *sk, int level, int optname,
err = tp->af_specific->md5_parse(sk, optname, optval, optlen);
break;
#endif
case TCP_USER_TIMEOUT:
/* Cap the max time in ms TCP will retry or probe the window
* before giving up and aborting (ETIMEDOUT) a connection.
*/
if (val < 0)
err = -EINVAL;
else
WRITE_ONCE(icsk->icsk_user_timeout, val);
break;

case TCP_FASTOPEN:
if (val >= 0 && ((1 << sk->sk_state) & (TCPF_CLOSE |
TCPF_LISTEN))) {
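The change in this file follows one shape for every option: validate the value first, publish it with a WRITE_ONCE() store instead of a lock_sock()/release_sock() pair, and annotate the lockless readers with READ_ONCE(). A condensed sketch of that pattern (example_field, EXAMPLE_MAX and EXAMPLE_DEFAULT are made-up names for illustration, not fields from this patch):

/* Writer: publish an independent integer field without the socket lock. */
int tcp_sock_set_example(struct sock *sk, int val)
{
	if (val < 1 || val > EXAMPLE_MAX)	/* validate before publishing */
		return -EINVAL;
	WRITE_ONCE(tcp_sk(sk)->example_field, val);
	return 0;
}

/* Reader (timer or input path): a single annotated load, no lock needed. */
static u32 tcp_example_value(const struct sock *sk)
{
	return READ_ONCE(tcp_sk(sk)->example_field) ? : EXAMPLE_DEFAULT;
}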
4 changes: 2 additions & 2 deletions net/ipv4/tcp_input.c
@@ -6324,7 +6324,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
if (fastopen_fail)
return -1;
if (sk->sk_write_pending ||
icsk->icsk_accept_queue.rskq_defer_accept ||
READ_ONCE(icsk->icsk_accept_queue.rskq_defer_accept) ||
inet_csk_in_pingpong_mode(sk)) {
/* Save one ACK. Data will be ready after
* several ticks, if write_pending is set.
@@ -6624,7 +6624,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
break;
}

if (tp->linger2 < 0) {
if (READ_ONCE(tp->linger2) < 0) {
tcp_done(sk);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTONDATA);
return 1;
2 changes: 1 addition & 1 deletion net/ipv4/tcp_minisocks.c
@@ -792,7 +792,7 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
return sk;

/* While TCP_DEFER_ACCEPT is active, drop bare ACK. */
if (req->num_timeout < inet_csk(sk)->icsk_accept_queue.rskq_defer_accept &&
if (req->num_timeout < READ_ONCE(inet_csk(sk)->icsk_accept_queue.rskq_defer_accept) &&
TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
inet_rsk(req)->acked = 1;
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPDEFERACCEPTDROP);
48 changes: 29 additions & 19 deletions net/ipv4/tcp_timer.c
@@ -26,14 +26,15 @@
static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 elapsed, start_ts;
u32 elapsed, start_ts, user_timeout;
s32 remaining;

start_ts = tcp_sk(sk)->retrans_stamp;
if (!icsk->icsk_user_timeout)
user_timeout = READ_ONCE(icsk->icsk_user_timeout);
if (!user_timeout)
return icsk->icsk_rto;
elapsed = tcp_time_stamp(tcp_sk(sk)) - start_ts;
remaining = icsk->icsk_user_timeout - elapsed;
remaining = user_timeout - elapsed;
if (remaining <= 0)
return 1; /* user timeout has passed; fire ASAP */

@@ -43,16 +44,17 @@ static u32 tcp_clamp_rto_to_user_timeout(const struct sock *sk)
u32 tcp_clamp_probe0_to_user_timeout(const struct sock *sk, u32 when)
{
struct inet_connection_sock *icsk = inet_csk(sk);
u32 remaining;
u32 remaining, user_timeout;
s32 elapsed;

if (!icsk->icsk_user_timeout || !icsk->icsk_probes_tstamp)
user_timeout = READ_ONCE(icsk->icsk_user_timeout);
if (!user_timeout || !icsk->icsk_probes_tstamp)
return when;

elapsed = tcp_jiffies32 - icsk->icsk_probes_tstamp;
if (unlikely(elapsed < 0))
elapsed = 0;
remaining = msecs_to_jiffies(icsk->icsk_user_timeout) - elapsed;
remaining = msecs_to_jiffies(user_timeout) - elapsed;
remaining = max_t(u32, remaining, TCP_TIMEOUT_MIN);

return min_t(u32, remaining, when);
@@ -239,7 +241,8 @@ static int tcp_write_timeout(struct sock *sk)
if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
if (icsk->icsk_retransmits)
__dst_negative_advice(sk);
retry_until = icsk->icsk_syn_retries ? :
/* Paired with WRITE_ONCE() in tcp_sock_set_syncnt() */
retry_until = READ_ONCE(icsk->icsk_syn_retries) ? :
READ_ONCE(net->ipv4.sysctl_tcp_syn_retries);

max_retransmits = retry_until;
@@ -269,7 +272,7 @@ static int tcp_write_timeout(struct sock *sk)
}
if (!expired)
expired = retransmits_timed_out(sk, retry_until,
icsk->icsk_user_timeout);
READ_ONCE(icsk->icsk_user_timeout));
tcp_fastopen_active_detect_blackhole(sk, expired);

if (BPF_SOCK_OPS_TEST_FLAG(tp, BPF_SOCK_OPS_RTO_CB_FLAG))
@@ -383,13 +386,16 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
if (!icsk->icsk_probes_tstamp)
if (!icsk->icsk_probes_tstamp) {
icsk->icsk_probes_tstamp = tcp_jiffies32;
else if (icsk->icsk_user_timeout &&
(s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
msecs_to_jiffies(icsk->icsk_user_timeout))
goto abort;
} else {
u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);

if (user_timeout &&
(s32)(tcp_jiffies32 - icsk->icsk_probes_tstamp) >=
msecs_to_jiffies(user_timeout))
goto abort;
}
max_probes = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_retries2);
if (sock_flag(sk, SOCK_DEAD)) {
const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
@@ -421,8 +427,10 @@ static void tcp_fastopen_synack_timer(struct sock *sk, struct request_sock *req)

req->rsk_ops->syn_ack_timeout(req);

/* add one more retry for fastopen */
max_retries = icsk->icsk_syn_retries ? :
/* Add one more retry for fastopen.
* Paired with WRITE_ONCE() in tcp_sock_set_syncnt()
*/
max_retries = READ_ONCE(icsk->icsk_syn_retries) ? :
READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_synack_retries) + 1;

if (req->num_timeout >= max_retries) {
@@ -706,7 +714,7 @@ static void tcp_keepalive_timer (struct timer_list *t)

tcp_mstamp_refresh(tp);
if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
if (tp->linger2 >= 0) {
if (READ_ONCE(tp->linger2) >= 0) {
const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

if (tmo > 0) {
@@ -731,13 +739,15 @@ static void tcp_keepalive_timer (struct timer_list *t)
elapsed = keepalive_time_elapsed(tp);

if (elapsed >= keepalive_time_when(tp)) {
u32 user_timeout = READ_ONCE(icsk->icsk_user_timeout);

/* If the TCP_USER_TIMEOUT option is enabled, use that
* to determine when to timeout instead.
*/
if ((icsk->icsk_user_timeout != 0 &&
elapsed >= msecs_to_jiffies(icsk->icsk_user_timeout) &&
if ((user_timeout != 0 &&
elapsed >= msecs_to_jiffies(user_timeout) &&
icsk->icsk_probes_out > 0) ||
(icsk->icsk_user_timeout == 0 &&
(user_timeout == 0 &&
icsk->icsk_probes_out >= keepalive_probes(tp))) {
tcp_send_active_reset(sk, GFP_ATOMIC);
tcp_write_err(sk);
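Note how the timer paths above load icsk_user_timeout once into a local user_timeout before testing and using it, so a concurrent lockless writer cannot change the value between the enable check and the timeout arithmetic. A minimal sketch of that snapshot-once idea (the helper name is illustrative; the field and macros are the real kernel ones):

/* Sketch: take one READ_ONCE() snapshot per decision so both the
 * "is it enabled?" test and the comparison see the same value.
 */
static bool example_user_timeout_expired(const struct sock *sk, u32 elapsed)
{
	u32 user_timeout = READ_ONCE(inet_csk(sk)->icsk_user_timeout);

	return user_timeout && elapsed >= msecs_to_jiffies(user_timeout);
}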
