Skip to content

Commit

Permalink
[TCP]: Move to new TSO segmenting scheme.
Browse files Browse the repository at this point in the history
Make TSO segment transmit size decisions at send time not earlier.

The basic scheme is that we try to build as large a TSO frame as
possible when pulling in the user data, but the size of the TSO frame
output to the card is determined at transmit time.

This is guided by tp->xmit_size_goal.  It is always set to a multiple
of MSS and tells sendmsg/sendpage how large an SKB to try and build.

Later, tcp_write_xmit() and tcp_push_one() chop up the packet if
necessary and conditions warrant.  These routines can also decide to
"defer" in order to wait for more ACKs to arrive and thus allow larger
TSO frames to be emitted.

A general observation is that TSO elongates the pipe, thus requiring a
larger congestion window and larger buffering especially at the sender
side.  Therefore, it is important that applications 1) get a large
enough socket send buffer (this is accomplished by our dynamic send
buffer expansion code) 2) do large enough writes.

Signed-off-by: David S. Miller <davem@davemloft.net>
  • Loading branch information
davem330 committed Jul 5, 2005
1 parent 0d9901d commit c1b4a7e
Show file tree
Hide file tree
Showing 7 changed files with 384 additions and 240 deletions.
2 changes: 1 addition & 1 deletion include/linux/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -286,7 +286,7 @@ struct tcp_sock {
__u32 max_window; /* Maximal window ever seen from peer */
__u32 pmtu_cookie; /* Last pmtu seen by socket */
__u32 mss_cache; /* Cached effective mss, not including SACKS */
__u16 mss_cache_std; /* Like mss_cache, but without TSO */
__u16 xmit_size_goal; /* Goal for segmenting output packets */
__u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */
__u8 ca_state; /* State of fast-retransmit machine */
__u8 retransmits; /* Number of unrecovered RTO timeouts. */
Expand Down
4 changes: 2 additions & 2 deletions include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -862,7 +862,7 @@ extern int tcp_write_wakeup(struct sock *);
extern void tcp_send_fin(struct sock *sk);
extern void tcp_send_active_reset(struct sock *sk, int priority);
extern int tcp_send_synack(struct sock *);
extern void tcp_push_one(struct sock *, unsigned mss_now);
extern void tcp_push_one(struct sock *, unsigned int mss_now);
extern void tcp_send_ack(struct sock *sk);
extern void tcp_send_delayed_ack(struct sock *sk);

Expand Down Expand Up @@ -968,7 +968,7 @@ static inline void tcp_reset_xmit_timer(struct sock *sk, int what, unsigned long
static inline void tcp_initialize_rcv_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
unsigned int hint = min(tp->advmss, tp->mss_cache_std);
unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache);

hint = min(hint, tp->rcv_wnd/2);
hint = min(hint, TCP_MIN_RCVMSS);
Expand Down
26 changes: 15 additions & 11 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
size_t psize, int flags)
{
struct tcp_sock *tp = tcp_sk(sk);
int mss_now;
int mss_now, size_goal;
int err;
ssize_t copied;
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
Expand All @@ -628,6 +628,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
size_goal = tp->xmit_size_goal;
copied = 0;

err = -EPIPE;
Expand All @@ -641,7 +642,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
int offset = poffset % PAGE_SIZE;
int size = min_t(size_t, psize, PAGE_SIZE - offset);

if (!sk->sk_send_head || (copy = mss_now - skb->len) <= 0) {
if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
Expand All @@ -652,7 +653,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
goto wait_for_memory;

skb_entail(sk, tp, skb);
copy = mss_now;
copy = size_goal;
}

if (copy > size)
Expand Down Expand Up @@ -693,7 +694,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
if (!(psize -= copy))
goto out;

if (skb->len != mss_now || (flags & MSG_OOB))
if (skb->len < mss_now || (flags & MSG_OOB))
continue;

if (forced_push(tp)) {
Expand All @@ -713,6 +714,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse
goto do_error;

mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
size_goal = tp->xmit_size_goal;
}

out:
Expand Down Expand Up @@ -754,7 +756,7 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset,

static inline int select_size(struct sock *sk, struct tcp_sock *tp)
{
int tmp = tp->mss_cache_std;
int tmp = tp->mss_cache;

if (sk->sk_route_caps & NETIF_F_SG) {
if (sk->sk_route_caps & NETIF_F_TSO)
Expand All @@ -778,7 +780,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
int iovlen, flags;
int mss_now;
int mss_now, size_goal;
int err, copied;
long timeo;

Expand All @@ -797,6 +799,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);

mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
size_goal = tp->xmit_size_goal;

/* Ok commence sending. */
iovlen = msg->msg_iovlen;
Expand All @@ -819,7 +822,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
skb = sk->sk_write_queue.prev;

if (!sk->sk_send_head ||
(copy = mss_now - skb->len) <= 0) {
(copy = size_goal - skb->len) <= 0) {

new_segment:
/* Allocate new segment. If the interface is SG,
Expand All @@ -842,7 +845,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
skb->ip_summed = CHECKSUM_HW;

skb_entail(sk, tp, skb);
copy = mss_now;
copy = size_goal;
}

/* Try to append data to the end of skb. */
Expand Down Expand Up @@ -937,7 +940,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
if ((seglen -= copy) == 0 && iovlen == 0)
goto out;

if (skb->len != mss_now || (flags & MSG_OOB))
if (skb->len < mss_now || (flags & MSG_OOB))
continue;

if (forced_push(tp)) {
Expand All @@ -957,6 +960,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
goto do_error;

mss_now = tcp_current_mss(sk, !(flags&MSG_OOB));
size_goal = tp->xmit_size_goal;
}
}

Expand Down Expand Up @@ -2128,7 +2132,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)

info->tcpi_rto = jiffies_to_usecs(tp->rto);
info->tcpi_ato = jiffies_to_usecs(tp->ack.ato);
info->tcpi_snd_mss = tp->mss_cache_std;
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = tp->ack.rcv_mss;

info->tcpi_unacked = tp->packets_out;
Expand Down Expand Up @@ -2178,7 +2182,7 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval,

switch (optname) {
case TCP_MAXSEG:
val = tp->mss_cache_std;
val = tp->mss_cache;
if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
val = tp->rx_opt.user_mss;
break;
Expand Down
10 changes: 5 additions & 5 deletions net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -740,10 +740,10 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst)
__u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0);

if (!cwnd) {
if (tp->mss_cache_std > 1460)
if (tp->mss_cache > 1460)
cwnd = 2;
else
cwnd = (tp->mss_cache_std > 1095) ? 3 : 4;
cwnd = (tp->mss_cache > 1095) ? 3 : 4;
}
return min_t(__u32, cwnd, tp->snd_cwnd_clamp);
}
Expand Down Expand Up @@ -914,7 +914,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
if (sk->sk_route_caps & NETIF_F_TSO) {
sk->sk_route_caps &= ~NETIF_F_TSO;
sock_set_flag(sk, SOCK_NO_LARGESEND);
tp->mss_cache = tp->mss_cache_std;
tp->mss_cache = tp->mss_cache;
}

if (!tp->sacked_out)
Expand Down Expand Up @@ -1077,7 +1077,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
(IsFack(tp) ||
!before(lost_retrans,
TCP_SKB_CB(skb)->ack_seq + tp->reordering *
tp->mss_cache_std))) {
tp->mss_cache))) {
TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);

Expand Down Expand Up @@ -3334,7 +3334,7 @@ static void tcp_new_space(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);

if (tcp_should_expand_sndbuf(sk, tp)) {
int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache_std) +
int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) +
MAX_TCP_HEADER + 16 + sizeof(struct sk_buff),
demanded = max_t(unsigned int, tp->snd_cwnd,
tp->reordering + 1);
Expand Down
2 changes: 1 addition & 1 deletion net/ipv4/tcp_ipv4.c
Original file line number Diff line number Diff line change
Expand Up @@ -2045,7 +2045,7 @@ static int tcp_v4_init_sock(struct sock *sk)
*/
tp->snd_ssthresh = 0x7fffffff; /* Infinity */
tp->snd_cwnd_clamp = ~0;
tp->mss_cache_std = tp->mss_cache = 536;
tp->mss_cache = 536;

tp->reordering = sysctl_tcp_reordering;
tp->ca_ops = &tcp_init_congestion_ops;
Expand Down
Loading

0 comments on commit c1b4a7e

Please sign in to comment.