
Commit ea0f3dc

Revert "mptcp: correctly ensure to not overfill subflows"
This reverts commit 088ed7d. It has been reported that this new commit is causing issues with kernels v4.14 and older. Probably more adaptions are needed to work on these old kernels. Best to revert this if it is causing more issues than what it solves on these old kernels. Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
1 parent 7aab281 · commit ea0f3dc

File tree

3 files changed: +36, −49 lines


include/net/tcp.h

Lines changed: 0 additions & 1 deletion
@@ -380,7 +380,6 @@ int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib);
 void tcp_event_new_data_sent(struct sock *sk, const struct sk_buff *skb);
 int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
		     gfp_t gfp_mask);
-u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now);
 unsigned int tcp_mss_split_point(const struct sock *sk,
				 const struct sk_buff *skb,
				 unsigned int mss_now,

net/ipv4/tcp_output.c

Lines changed: 1 addition & 1 deletion
@@ -1756,7 +1756,7 @@ EXPORT_SYMBOL(tcp_tso_autosize);
 /* Return the number of segments we want in the skb we are transmitting.
  * See if congestion control module wants to decide; otherwise, autosize.
  */
-u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
+static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now)
 {
	const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
	u32 tso_segs = ca_ops->tso_segs_goal ? ca_ops->tso_segs_goal(sk) : 0;
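For context, the visible lines of this hunk show the usual "hook decides, otherwise autosize" dispatch: tcp_tso_segs() asks the congestion-control module via its optional tso_segs_goal hook and, per the comment above, autosizes when the module declines. Below is a minimal user-space sketch of that pattern; all names and types here are hypothetical stand-ins, not the kernel's.

/* Hypothetical stand-in for the ca_ops->tso_segs_goal dispatch shown above:
 * use the module's answer when it provides one, otherwise autosize.
 */
#include <stdint.h>

struct cc_ops {
	uint32_t (*tso_segs_goal)(void *sk);	/* optional hook, may be NULL */
};

static uint32_t autosize_fallback(void *sk, unsigned int mss_now)
{
	(void)sk;
	(void)mss_now;
	return 2;	/* placeholder: the kernel derives this elsewhere */
}

static uint32_t tso_segs(const struct cc_ops *ops, void *sk,
			 unsigned int mss_now)
{
	uint32_t goal = ops->tso_segs_goal ? ops->tso_segs_goal(sk) : 0;

	/* A zero goal means the module did not decide: fall back. */
	return goal ? goal : autosize_fallback(sk, mss_now);
}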

net/mptcp/mptcp_sched.c

Lines changed: 35 additions & 47 deletions
@@ -1,6 +1,5 @@
 /* MPTCP Scheduler module selector. Highly inspired by tcp_cong.c */
 
-#include <linux/bug.h>
 #include <linux/module.h>
 #include <net/mptcp.h>
 
@@ -37,38 +36,12 @@ bool mptcp_is_def_unavailable(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(mptcp_is_def_unavailable);
 
-/* estimate number of segments currently in flight + unsent in
- * the subflow socket.
- */
-static int mptcp_subflow_queued(struct sock *sk, u32 max_tso_segs)
-{
-	const struct tcp_sock *tp = tcp_sk(sk);
-	unsigned int queued;
-
-	/* estimate the max number of segments in the write queue
-	 * this is an overestimation, avoiding to iterate over the queue
-	 * to make a better estimation.
-	 * Having only one skb in the queue however might trigger tso deferral,
-	 * delaying the sending of a tso segment in the hope that skb_entail
-	 * will append more data to the skb soon.
-	 * Therefore, in the case only one skb is in the queue, we choose to
-	 * potentially underestimate, risking to schedule one skb too many onto
-	 * the subflow rather than not enough.
-	 */
-	if (sk->sk_write_queue.qlen > 1)
-		queued = sk->sk_write_queue.qlen * max_tso_segs;
-	else
-		queued = sk->sk_write_queue.qlen;
-
-	return queued + tcp_packets_in_flight(tp);
-}
-
 static bool mptcp_is_temp_unavailable(struct sock *sk,
				       const struct sk_buff *skb,
				       bool zero_wnd_test)
 {
	const struct tcp_sock *tp = tcp_sk(sk);
-	unsigned int mss_now;
+	unsigned int mss_now, space, in_flight;
 
	if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) {
		/* If SACK is disabled, and we got a loss, TCP does not exit
@@ -92,11 +65,19 @@ static bool mptcp_is_temp_unavailable(struct sock *sk,
		return true;
	}
 
+	in_flight = tcp_packets_in_flight(tp);
+	/* Not even a single spot in the cwnd */
+	if (in_flight >= tp->snd_cwnd)
+		return true;
+
	mss_now = tcp_current_mss(sk);
 
-	/* Not even a single spot in the cwnd */
-	if (mptcp_subflow_queued(sk, tcp_tso_segs(sk, tcp_current_mss(sk)))
-	    >= tp->snd_cwnd)
+	/* Now, check if what is queued in the subflow's send-queue
+	 * already fills the cwnd.
+	 */
+	space = (tp->snd_cwnd - in_flight) * mss_now;
+
+	if (tp->write_seq - tp->snd_nxt >= space)
		return true;
 
	if (zero_wnd_test && !before(tp->write_seq, tcp_wnd_end(tp)))
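The restored check reasons in bytes rather than in estimated segments: the free space in the congestion window is (snd_cwnd − in_flight) × mss_now, and the subflow counts as temporarily unavailable once its queued-but-unsent bytes (write_seq − snd_nxt) already fill that space. Below is a minimal user-space sketch of the same arithmetic, with a simplified stand-in struct instead of the kernel's struct tcp_sock.

#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-in for the subflow state used by the restored check. */
struct subflow_state {
	uint32_t snd_cwnd;	/* congestion window, in segments */
	uint32_t in_flight;	/* packets currently in flight */
	uint32_t mss_now;	/* current MSS, in bytes */
	uint32_t write_seq;	/* next sequence number to be queued */
	uint32_t snd_nxt;	/* next sequence number to be sent */
};

/* Mirrors the restored logic in mptcp_is_temp_unavailable(). */
static bool cwnd_is_full(const struct subflow_state *s)
{
	uint32_t space;

	/* Not even a single spot in the cwnd. */
	if (s->in_flight >= s->snd_cwnd)
		return true;

	/* Bytes that still fit into the cwnd ... */
	space = (s->snd_cwnd - s->in_flight) * s->mss_now;

	/* ... versus bytes already queued but not yet sent. */
	return s->write_seq - s->snd_nxt >= space;
}

Because all fields are unsigned, the in_flight >= snd_cwnd test must come first, as it does in the kernel code, so the subtraction in the space computation cannot wrap.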
@@ -416,10 +397,11 @@ static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
					   unsigned int *limit)
 {
	struct sk_buff *skb = __mptcp_next_segment(meta_sk, reinject);
-	unsigned int mss_now;
-	u32 max_len, gso_max_segs, max_segs, max_tso_segs, window;
+	unsigned int mss_now, in_flight_space;
+	int remaining_in_flight_space;
+	u32 max_len, max_segs, window;
	struct tcp_sock *subtp;
-	int queued;
+	u16 gso_max_segs;
 
	/* As we set it, we have to reset it as well. */
	*limit = 0;
@@ -457,30 +439,36 @@ static struct sk_buff *mptcp_next_segment(struct sock *meta_sk,
	if (skb->len <= mss_now)
		return skb;
 
-	max_tso_segs = tcp_tso_segs(*subsk, tcp_current_mss(*subsk));
-	queued = mptcp_subflow_queued(*subsk, max_tso_segs);
-
-	/* this condition should already have been established in
-	 * mptcp_is_temp_unavailable when selecting available flows
+	/* The following is similar to tcp_mss_split_point, but
+	 * we do not care about nagle, because we will anyways
+	 * use TCP_NAGLE_PUSH, which overrides this.
	 */
-	WARN_ONCE(subtp->snd_cwnd <= queued, "Selected subflow no cwnd room");
 
	gso_max_segs = (*subsk)->sk_gso_max_segs;
	if (!gso_max_segs) /* No gso supported on the subflow's NIC */
		gso_max_segs = 1;
-
-	max_segs = min_t(unsigned int, subtp->snd_cwnd - queued, gso_max_segs);
+	max_segs = min_t(unsigned int, tcp_cwnd_test(subtp, skb), gso_max_segs);
	if (!max_segs)
		return NULL;
 
-	/* if there is room for a segment, schedule up to a complete TSO
-	 * segment to avoid TSO splitting. Even if it is more than allowed by
-	 * the congestion window.
+	/* max_len is what would fit in the cwnd (respecting the 2GSO-limit of
+	 * tcp_cwnd_test), but ignoring whatever was already queued.
	 */
-	max_segs = max_t(unsigned int, max_tso_segs, max_segs);
-
	max_len = min(mss_now * max_segs, skb->len);
 
+	in_flight_space = (subtp->snd_cwnd - tcp_packets_in_flight(subtp)) * mss_now;
+	remaining_in_flight_space = (int)in_flight_space - (subtp->write_seq - subtp->snd_nxt);
+
+	if (remaining_in_flight_space <= 0)
+		WARN_ONCE(1, "in_flight %u cwnd %u wseq %u snxt %u mss_now %u cache %u",
+			  tcp_packets_in_flight(subtp), subtp->snd_cwnd,
+			  subtp->write_seq, subtp->snd_nxt, mss_now, subtp->mss_cache);
+	else
+		/* max_len now fits exactly in the write-queue, taking into
+		 * account what was already queued.
+		 */
+		max_len = min_t(u32, max_len, remaining_in_flight_space);
+
	window = tcp_wnd_end(subtp) - subtp->write_seq;
 
	/* max_len now also respects the announced receive-window */
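The second half of this hunk applies the same byte-based view when sizing the next segment: max_len, already bounded by mss_now * max_segs and skb->len, is further capped by the cwnd space that remains after subtracting what is queued but unsent. A condensed sketch of that capping step follows, with hypothetical stand-in parameters rather than the kernel's struct tcp_sock.

#include <stdint.h>

/* Hypothetical helper condensing the restored capping logic above.
 * max_len is assumed to already respect mss_now * max_segs and skb->len.
 */
static uint32_t cap_to_in_flight_space(uint32_t max_len,
				       uint32_t snd_cwnd, uint32_t in_flight,
				       uint32_t mss_now,
				       uint32_t write_seq, uint32_t snd_nxt)
{
	uint32_t in_flight_space = (snd_cwnd - in_flight) * mss_now;
	/* Signed: queued-but-unsent bytes may exceed the free cwnd space. */
	int32_t remaining = (int32_t)in_flight_space -
			    (int32_t)(write_seq - snd_nxt);

	if (remaining <= 0)
		return max_len;	/* the kernel warns here via WARN_ONCE */

	/* max_len now also accounts for what was already queued. */
	return max_len < (uint32_t)remaining ? max_len : (uint32_t)remaining;
}

The signed intermediate matters: as the WARN_ONCE branch in the kernel code shows, queued-but-unsent data can exceed the free cwnd space, and an unsigned subtraction would silently wrap instead of flagging that case.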
