Skip to content

Commit eeaedc5

Browse files
dhowells authored and kuba-moo committed
rxrpc: Implement path-MTU probing using padded PING ACKs (RFC8899)
Implement path-MTU probing (along the lines of RFC8899) by padding some of the PING ACKs we send. PING ACKs get their own individual responses quite apart from the acking of data (though, as ACKs, they fulfil that role also).

The probing concentrates on packet sizes that correspond to how many subpackets can be stuffed inside a jumbo packet, as jumbo DATA packets are just aggregations of individual DATA packets and can be split easily for retransmission purposes.

If we want to perform probing, we advertise this by setting the maximum number of jumbo subpackets to 0 in the ack trailer when we send an ACK and see if the peer is also advertising the service. This is interpreted by non-supporting Rx stacks as an indication that jumbo packets aren't supported.

The MTU sizes advertised in the ACK trailer that AF_RXRPC transmits are pegged at a maximum of 1444 unless pmtud is supported by both sides.

Signed-off-by: David Howells <dhowells@redhat.com>
cc: Marc Dionne <marc.dionne@auristor.com>
cc: linux-afs@lists.infradead.org
Link: https://patch.msgid.link/20241204074710.990092-10-dhowells@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 420f8af commit eeaedc5

File tree

15 files changed

+382
-57
lines changed

15 files changed

+382
-57
lines changed

include/trace/events/rxrpc.h

Lines changed: 124 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -364,6 +364,7 @@
364364
EM(rxrpc_propose_ack_ping_for_lost_ack, "LostAck") \
365365
EM(rxrpc_propose_ack_ping_for_lost_reply, "LostRpl") \
366366
EM(rxrpc_propose_ack_ping_for_0_retrans, "0-Retrn") \
367+
EM(rxrpc_propose_ack_ping_for_mtu_probe, "MTUProb") \
367368
EM(rxrpc_propose_ack_ping_for_old_rtt, "OldRtt ") \
368369
EM(rxrpc_propose_ack_ping_for_params, "Params ") \
369370
EM(rxrpc_propose_ack_ping_for_rtt, "Rtt ") \
@@ -478,6 +479,11 @@
478479
EM(rxrpc_txbuf_see_send_more, "SEE SEND+ ") \
479480
E_(rxrpc_txbuf_see_unacked, "SEE UNACKED")
480481

482+
#define rxrpc_pmtud_reduce_traces \
483+
EM(rxrpc_pmtud_reduce_ack, "Ack ") \
484+
EM(rxrpc_pmtud_reduce_icmp, "Icmp ") \
485+
E_(rxrpc_pmtud_reduce_route, "Route")
486+
481487
/*
482488
* Generate enums for tracing information.
483489
*/
@@ -498,6 +504,7 @@ enum rxrpc_congest_change { rxrpc_congest_changes } __mode(byte);
498504
enum rxrpc_conn_trace { rxrpc_conn_traces } __mode(byte);
499505
enum rxrpc_local_trace { rxrpc_local_traces } __mode(byte);
500506
enum rxrpc_peer_trace { rxrpc_peer_traces } __mode(byte);
507+
enum rxrpc_pmtud_reduce_trace { rxrpc_pmtud_reduce_traces } __mode(byte);
501508
enum rxrpc_propose_ack_outcome { rxrpc_propose_ack_outcomes } __mode(byte);
502509
enum rxrpc_propose_ack_trace { rxrpc_propose_ack_traces } __mode(byte);
503510
enum rxrpc_receive_trace { rxrpc_receive_traces } __mode(byte);
@@ -534,6 +541,7 @@ rxrpc_congest_changes;
534541
rxrpc_congest_modes;
535542
rxrpc_conn_traces;
536543
rxrpc_local_traces;
544+
rxrpc_pmtud_reduce_traces;
537545
rxrpc_propose_ack_traces;
538546
rxrpc_receive_traces;
539547
rxrpc_recvmsg_traces;
@@ -2040,6 +2048,122 @@ TRACE_EVENT(rxrpc_sack,
20402048
__entry->sack)
20412049
);
20422050

2051+
TRACE_EVENT(rxrpc_pmtud_tx,
2052+
TP_PROTO(struct rxrpc_call *call),
2053+
2054+
TP_ARGS(call),
2055+
2056+
TP_STRUCT__entry(
2057+
__field(unsigned int, peer_debug_id)
2058+
__field(unsigned int, call_debug_id)
2059+
__field(rxrpc_serial_t, ping_serial)
2060+
__field(unsigned short, pmtud_trial)
2061+
__field(unsigned short, pmtud_good)
2062+
__field(unsigned short, pmtud_bad)
2063+
),
2064+
2065+
TP_fast_assign(
2066+
__entry->peer_debug_id = call->peer->debug_id;
2067+
__entry->call_debug_id = call->debug_id;
2068+
__entry->ping_serial = call->conn->pmtud_probe;
2069+
__entry->pmtud_trial = call->peer->pmtud_trial;
2070+
__entry->pmtud_good = call->peer->pmtud_good;
2071+
__entry->pmtud_bad = call->peer->pmtud_bad;
2072+
),
2073+
2074+
TP_printk("P=%08x c=%08x pr=%08x %u-%u-%u",
2075+
__entry->peer_debug_id,
2076+
__entry->call_debug_id,
2077+
__entry->ping_serial,
2078+
__entry->pmtud_good,
2079+
__entry->pmtud_trial,
2080+
__entry->pmtud_bad)
2081+
);
2082+
2083+
TRACE_EVENT(rxrpc_pmtud_rx,
2084+
TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial),
2085+
2086+
TP_ARGS(conn, resp_serial),
2087+
2088+
TP_STRUCT__entry(
2089+
__field(unsigned int, peer_debug_id)
2090+
__field(unsigned int, call_debug_id)
2091+
__field(rxrpc_serial_t, ping_serial)
2092+
__field(rxrpc_serial_t, resp_serial)
2093+
__field(unsigned short, max_data)
2094+
__field(u8, jumbo_max)
2095+
),
2096+
2097+
TP_fast_assign(
2098+
__entry->peer_debug_id = conn->peer->debug_id;
2099+
__entry->call_debug_id = conn->pmtud_call;
2100+
__entry->ping_serial = conn->pmtud_probe;
2101+
__entry->resp_serial = resp_serial;
2102+
__entry->max_data = conn->peer->max_data;
2103+
__entry->jumbo_max = conn->peer->pmtud_jumbo;
2104+
),
2105+
2106+
TP_printk("P=%08x c=%08x pr=%08x rr=%08x max=%u jm=%u",
2107+
__entry->peer_debug_id,
2108+
__entry->call_debug_id,
2109+
__entry->ping_serial,
2110+
__entry->resp_serial,
2111+
__entry->max_data,
2112+
__entry->jumbo_max)
2113+
);
2114+
2115+
TRACE_EVENT(rxrpc_pmtud_lost,
2116+
TP_PROTO(struct rxrpc_connection *conn, rxrpc_serial_t resp_serial),
2117+
2118+
TP_ARGS(conn, resp_serial),
2119+
2120+
TP_STRUCT__entry(
2121+
__field(unsigned int, peer_debug_id)
2122+
__field(unsigned int, call_debug_id)
2123+
__field(rxrpc_serial_t, ping_serial)
2124+
__field(rxrpc_serial_t, resp_serial)
2125+
),
2126+
2127+
TP_fast_assign(
2128+
__entry->peer_debug_id = conn->peer->debug_id;
2129+
__entry->call_debug_id = conn->pmtud_call;
2130+
__entry->ping_serial = conn->pmtud_probe;
2131+
__entry->resp_serial = resp_serial;
2132+
),
2133+
2134+
TP_printk("P=%08x c=%08x pr=%08x rr=%08x",
2135+
__entry->peer_debug_id,
2136+
__entry->call_debug_id,
2137+
__entry->ping_serial,
2138+
__entry->resp_serial)
2139+
);
2140+
2141+
TRACE_EVENT(rxrpc_pmtud_reduce,
2142+
TP_PROTO(struct rxrpc_peer *peer, rxrpc_serial_t serial,
2143+
unsigned int max_data, enum rxrpc_pmtud_reduce_trace reason),
2144+
2145+
TP_ARGS(peer, serial, max_data, reason),
2146+
2147+
TP_STRUCT__entry(
2148+
__field(unsigned int, peer_debug_id)
2149+
__field(rxrpc_serial_t, serial)
2150+
__field(unsigned int, max_data)
2151+
__field(enum rxrpc_pmtud_reduce_trace, reason)
2152+
),
2153+
2154+
TP_fast_assign(
2155+
__entry->peer_debug_id = peer->debug_id;
2156+
__entry->serial = serial;
2157+
__entry->max_data = max_data;
2158+
__entry->reason = reason;
2159+
),
2160+
2161+
TP_printk("P=%08x %s r=%08x m=%u",
2162+
__entry->peer_debug_id,
2163+
__print_symbolic(__entry->reason, rxrpc_pmtud_reduce_traces),
2164+
__entry->serial, __entry->max_data)
2165+
);
2166+
20432167
#undef EM
20442168
#undef E_
20452169

net/rxrpc/ar-internal.h

Lines changed: 21 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -344,13 +344,25 @@ struct rxrpc_peer {
344344
time64_t last_tx_at; /* Last time packet sent here */
345345
seqlock_t service_conn_lock;
346346
spinlock_t lock; /* access lock */
347-
unsigned int if_mtu; /* interface MTU for this peer */
348-
unsigned int mtu; /* network MTU for this peer */
349-
unsigned int maxdata; /* data size (MTU - hdrsize) */
350-
unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
351347
int debug_id; /* debug ID for printks */
352348
struct sockaddr_rxrpc srx; /* remote address */
353349

350+
/* Path MTU discovery [RFC8899] */
351+
unsigned int pmtud_trial; /* Current MTU probe size */
352+
unsigned int pmtud_good; /* Largest working MTU probe we've tried */
353+
unsigned int pmtud_bad; /* Smallest non-working MTU probe we've tried */
354+
bool pmtud_lost; /* T if MTU probe was lost */
355+
bool pmtud_probing; /* T if we have an active probe outstanding */
356+
bool pmtud_pending; /* T if a call to this peer should send a probe */
357+
u8 pmtud_jumbo; /* Max jumbo packets for the MTU */
358+
bool ackr_adv_pmtud; /* T if the peer advertises path-MTU */
359+
unsigned int ackr_max_data; /* Maximum data advertised by peer */
360+
seqcount_t mtu_lock; /* Lockless MTU access management */
361+
unsigned int if_mtu; /* Local interface MTU (- hdrsize) for this peer */
362+
unsigned int max_data; /* Maximum packet data capacity for this peer */
363+
unsigned short hdrsize; /* header size (IP + UDP + RxRPC) */
364+
unsigned short tx_seg_max; /* Maximum number of transmissable segments */
365+
354366
/* calculated RTT cache */
355367
#define RXRPC_RTT_CACHE_SIZE 32
356368
spinlock_t rtt_input_lock; /* RTT lock for input routine */
@@ -531,6 +543,8 @@ struct rxrpc_connection {
531543
int debug_id; /* debug ID for printks */
532544
rxrpc_serial_t tx_serial; /* Outgoing packet serial number counter */
533545
unsigned int hi_serial; /* highest serial number received */
546+
rxrpc_serial_t pmtud_probe; /* Serial of MTU probe (or 0) */
547+
unsigned int pmtud_call; /* ID of call used for probe */
534548
u32 service_id; /* Service ID, possibly upgraded */
535549
u32 security_level; /* Security level selected */
536550
u8 security_ix; /* security type */
@@ -1155,6 +1169,7 @@ static inline struct rxrpc_net *rxrpc_net(struct net *net)
11551169
*/
11561170
void rxrpc_send_ACK(struct rxrpc_call *call, u8 ack_reason,
11571171
rxrpc_serial_t serial, enum rxrpc_propose_ack_trace why);
1172+
void rxrpc_send_probe_for_pmtud(struct rxrpc_call *call);
11581173
int rxrpc_send_abort_packet(struct rxrpc_call *);
11591174
void rxrpc_send_conn_abort(struct rxrpc_connection *conn);
11601175
void rxrpc_reject_packet(struct rxrpc_local *local, struct sk_buff *skb);
@@ -1166,6 +1181,8 @@ void rxrpc_transmit_one(struct rxrpc_call *call, struct rxrpc_txbuf *txb);
11661181
*/
11671182
void rxrpc_input_error(struct rxrpc_local *, struct sk_buff *);
11681183
void rxrpc_peer_keepalive_worker(struct work_struct *);
1184+
void rxrpc_input_probe_for_pmtud(struct rxrpc_connection *conn, rxrpc_serial_t acked_serial,
1185+
bool sendmsg_fail);
11691186

11701187
/*
11711188
* peer_object.c

net/rxrpc/call_event.c

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -483,6 +483,11 @@ bool rxrpc_input_call_event(struct rxrpc_call *call, struct sk_buff *skb)
483483
rxrpc_disconnect_call(call);
484484
if (call->security)
485485
call->security->free_call_crypto(call);
486+
} else {
487+
if (skb &&
488+
call->peer->ackr_adv_pmtud &&
489+
call->peer->pmtud_pending)
490+
rxrpc_send_probe_for_pmtud(call);
486491
}
487492
if (call->acks_hard_ack != call->tx_bottom)
488493
rxrpc_shrink_call_tx_buffer(call);

net/rxrpc/conn_event.c

Lines changed: 11 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -92,7 +92,7 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
9292
struct rxrpc_acktrailer trailer;
9393
size_t len;
9494
int ret, ioc;
95-
u32 serial, mtu, call_id, padding;
95+
u32 serial, max_mtu, if_mtu, call_id, padding;
9696

9797
_enter("%d", conn->debug_id);
9898

@@ -150,19 +150,24 @@ void rxrpc_conn_retransmit_call(struct rxrpc_connection *conn,
150150
break;
151151

152152
case RXRPC_PACKET_TYPE_ACK:
153-
mtu = conn->peer->if_mtu;
154-
mtu -= conn->peer->hdrsize;
153+
if_mtu = conn->peer->if_mtu - conn->peer->hdrsize;
154+
if (conn->peer->ackr_adv_pmtud) {
155+
max_mtu = umax(conn->peer->max_data, rxrpc_rx_mtu);
156+
} else {
157+
if_mtu = umin(1444, if_mtu);
158+
max_mtu = if_mtu;
159+
}
155160
pkt.ack.bufferSpace = 0;
156161
pkt.ack.maxSkew = htons(skb ? skb->priority : 0);
157162
pkt.ack.firstPacket = htonl(chan->last_seq + 1);
158163
pkt.ack.previousPacket = htonl(chan->last_seq);
159164
pkt.ack.serial = htonl(skb ? sp->hdr.serial : 0);
160165
pkt.ack.reason = skb ? RXRPC_ACK_DUPLICATE : RXRPC_ACK_IDLE;
161166
pkt.ack.nAcks = 0;
162-
trailer.maxMTU = htonl(rxrpc_rx_mtu);
163-
trailer.ifMTU = htonl(mtu);
167+
trailer.maxMTU = htonl(max_mtu);
168+
trailer.ifMTU = htonl(if_mtu);
164169
trailer.rwind = htonl(rxrpc_rx_window_size);
165-
trailer.jumbo_max = htonl(rxrpc_rx_jumbo_max);
170+
trailer.jumbo_max = 0;
166171
pkt.whdr.flags |= RXRPC_SLOW_START_OK;
167172
padding = 0;
168173
iov[0].iov_len += sizeof(pkt.ack);

net/rxrpc/conn_object.c

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -321,6 +321,12 @@ static void rxrpc_clean_up_connection(struct work_struct *work)
321321
list_del_init(&conn->proc_link);
322322
write_unlock(&rxnet->conn_lock);
323323

324+
if (conn->pmtud_probe) {
325+
trace_rxrpc_pmtud_lost(conn, 0);
326+
conn->peer->pmtud_probing = false;
327+
conn->peer->pmtud_pending = true;
328+
}
329+
324330
rxrpc_purge_queue(&conn->rx_queue);
325331

326332
rxrpc_kill_client_conn(conn);

net/rxrpc/input.c

Lines changed: 17 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -692,8 +692,8 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb
692692
struct rxrpc_acktrailer *trailer)
693693
{
694694
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
695-
struct rxrpc_peer *peer;
696-
unsigned int mtu;
695+
struct rxrpc_peer *peer = call->peer;
696+
unsigned int max_data;
697697
bool wake = false;
698698
u32 rwind = ntohl(trailer->rwind);
699699

@@ -706,14 +706,22 @@ static void rxrpc_input_ack_trailer(struct rxrpc_call *call, struct sk_buff *skb
706706
call->tx_winsize = rwind;
707707
}
708708

709-
mtu = umin(ntohl(trailer->maxMTU), ntohl(trailer->ifMTU));
709+
if (trailer->jumbo_max == 0) {
710+
/* The peer says it supports pmtu discovery */
711+
peer->ackr_adv_pmtud = true;
712+
} else {
713+
peer->ackr_adv_pmtud = false;
714+
}
715+
716+
max_data = ntohl(trailer->maxMTU);
717+
peer->ackr_max_data = max_data;
710718

711-
peer = call->peer;
712-
if (mtu < peer->maxdata) {
713-
spin_lock(&peer->lock);
714-
peer->maxdata = mtu;
715-
peer->mtu = mtu + peer->hdrsize;
716-
spin_unlock(&peer->lock);
719+
if (max_data < peer->max_data) {
720+
trace_rxrpc_pmtud_reduce(peer, sp->hdr.serial, max_data,
721+
rxrpc_pmtud_reduce_ack);
722+
write_seqcount_begin(&peer->mtu_lock);
723+
peer->max_data = max_data;
724+
write_seqcount_end(&peer->mtu_lock);
717725
}
718726

719727
if (wake)

net/rxrpc/io_thread.c

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -364,6 +364,12 @@ static int rxrpc_input_packet_on_conn(struct rxrpc_connection *conn,
364364
if (sp->hdr.callNumber == 0)
365365
return rxrpc_input_conn_packet(conn, skb);
366366

367+
/* Deal with path MTU discovery probing. */
368+
if (sp->hdr.type == RXRPC_PACKET_TYPE_ACK &&
369+
conn->pmtud_probe &&
370+
after_eq(sp->ack.acked_serial, conn->pmtud_probe))
371+
rxrpc_input_probe_for_pmtud(conn, sp->ack.acked_serial, false);
372+
367373
/* Call-bound packets are routed by connection channel. */
368374
channel = sp->hdr.cid & RXRPC_CHANNELMASK;
369375
chan = &conn->channels[channel];

net/rxrpc/misc.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -46,13 +46,13 @@ unsigned int rxrpc_rx_window_size = 255;
4646
* Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
4747
* made by gluing normal packets together that we're willing to handle.
4848
*/
49-
unsigned int rxrpc_rx_mtu = 5692;
49+
unsigned int rxrpc_rx_mtu = RXRPC_JUMBO(46);
5050

5151
/*
5252
* The maximum number of fragments in a received jumbo packet that we tell the
5353
* sender that we're willing to handle.
5454
*/
55-
unsigned int rxrpc_rx_jumbo_max = 4;
55+
unsigned int rxrpc_rx_jumbo_max = 46;
5656

5757
#ifdef CONFIG_AF_RXRPC_INJECT_RX_DELAY
5858
/*

0 commit comments

Comments
 (0)