
Commit dc82a33

netoptimizer authored and kuba-moo committed
veth: apply qdisc backpressure on full ptr_ring to reduce TX drops
In production, we're seeing TX drops on veth devices when the ptr_ring fills up. This can occur when NAPI mode is enabled, though it's relatively rare. However, with threaded NAPI - which we use in production - the drops become significantly more frequent.

The underlying issue is that with threaded NAPI, the consumer often runs on a different CPU than the producer. This increases the likelihood of the ring filling up before the consumer gets scheduled, especially under load, leading to drops in veth_xmit() (ndo_start_xmit()).

This patch introduces backpressure by returning NETDEV_TX_BUSY when the ring is full, signaling the qdisc layer to requeue the packet. The txq (netdev queue) is stopped in this condition and restarted once veth_poll() drains entries from the ring, ensuring coordination between NAPI and qdisc.

Backpressure is only enabled when a qdisc is attached. Without a qdisc, the driver retains its original behavior - dropping packets immediately when the ring is full. This avoids unexpected behavior changes in setups without a configured qdisc. With a qdisc in place (e.g. fq, sfq) this allows Active Queue Management (AQM) to fairly schedule packets across flows and reduce collateral damage from elephant flows.

A known limitation of this approach is that the full ring sits in front of the qdisc layer, effectively forming a FIFO buffer that introduces base latency. While AQM still improves fairness and mitigates flow dominance, the latency impact is measurable.

In hardware drivers, this issue is typically addressed using BQL (Byte Queue Limits), which tracks in-flight bytes needed based on physical link rate. However, for virtual drivers like veth, there is no fixed bandwidth constraint - the bottleneck is CPU availability and the scheduler's ability to run the NAPI thread. It is unclear how effective BQL would be in this context.

This patch serves as a first step toward addressing TX drops. Future work may explore adapting a BQL-like mechanism to better suit virtual devices like veth.

Reported-by: Yan Zhai <yan@cloudflare.com>
Signed-off-by: Jesper Dangaard Brouer <hawk@kernel.org>
Reviewed-by: Toshiaki Makita <toshiaki.makita1@gmail.com>
Link: https://patch.msgid.link/174559294022.827981.1282809941662942189.stgit@firesoul
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
1 parent 34dd0fe commit dc82a33

1 file changed: +47, -10 lines changed


drivers/net/veth.c

Lines changed: 47 additions & 10 deletions
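For orientation before reading the diff, here is the producer-side backpressure path condensed into a minimal sketch. It is not new code: it is the veth_xmit() hunk below stripped down to the stop/wake handshake, with declarations, stats handling and the XDP receive path omitted; the comments reflect my reading of that handshake.

    /* veth_forward_skb() now returns NETDEV_TX_BUSY when the peer's
     * xdp_ring is full in NAPI mode (first hunk below).
     */
    ret = veth_forward_skb(rcv, skb, rq, use_napi);
    if (ret == NETDEV_TX_BUSY) {
            txq = netdev_get_tx_queue(dev, rxq);

            /* No qdisc attached: keep the old behavior and drop. */
            if (qdisc_txq_has_no_queue(txq)) {
                    dev_kfree_skb_any(skb);
                    goto drop;
            }

            /* Hand the skb back to the qdisc intact: restore the Ethernet
             * header pulled by dev_forward_skb()/eth_type_trans().
             */
            __skb_push(skb, ETH_HLEN);

            /* Stop the txq, then re-check the ring. The barrier pairs with
             * the consumer in veth_poll()/veth_xdp_rcv(); if the consumer
             * already drained the ring, cancel our own stop so the queue
             * cannot stay stopped with nobody left to wake it.
             */
            netif_tx_stop_queue(txq);
            smp_mb__after_atomic();
            if (unlikely(__ptr_ring_empty(&rq->xdp_ring)))
                    netif_tx_wake_queue(txq);

            /* Returning NETDEV_TX_BUSY tells the qdisc layer to requeue. */
    }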
@@ -307,12 +307,10 @@ static void __veth_xdp_flush(struct veth_rq *rq)
 
 static int veth_xdp_rx(struct veth_rq *rq, struct sk_buff *skb)
 {
-        if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb))) {
-                dev_kfree_skb_any(skb);
-                return NET_RX_DROP;
-        }
+        if (unlikely(ptr_ring_produce(&rq->xdp_ring, skb)))
+                return NETDEV_TX_BUSY; /* signal qdisc layer */
 
-        return NET_RX_SUCCESS;
+        return NET_RX_SUCCESS; /* same as NETDEV_TX_OK */
 }
 
 static int veth_forward_skb(struct net_device *dev, struct sk_buff *skb,
@@ -346,11 +344,11 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 {
         struct veth_priv *rcv_priv, *priv = netdev_priv(dev);
         struct veth_rq *rq = NULL;
-        int ret = NETDEV_TX_OK;
+        struct netdev_queue *txq;
         struct net_device *rcv;
         int length = skb->len;
         bool use_napi = false;
-        int rxq;
+        int ret, rxq;
 
         rcu_read_lock();
         rcv = rcu_dereference(priv->peer);
@@ -373,17 +371,45 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
         }
 
         skb_tx_timestamp(skb);
-        if (likely(veth_forward_skb(rcv, skb, rq, use_napi) == NET_RX_SUCCESS)) {
+
+        ret = veth_forward_skb(rcv, skb, rq, use_napi);
+        switch (ret) {
+        case NET_RX_SUCCESS: /* same as NETDEV_TX_OK */
                 if (!use_napi)
                         dev_sw_netstats_tx_add(dev, 1, length);
                 else
                         __veth_xdp_flush(rq);
-        } else {
+                break;
+        case NETDEV_TX_BUSY:
+                /* If a qdisc is attached to our virtual device, returning
+                 * NETDEV_TX_BUSY is allowed.
+                 */
+                txq = netdev_get_tx_queue(dev, rxq);
+
+                if (qdisc_txq_has_no_queue(txq)) {
+                        dev_kfree_skb_any(skb);
+                        goto drop;
+                }
+                /* Restore Eth hdr pulled by dev_forward_skb/eth_type_trans */
+                __skb_push(skb, ETH_HLEN);
+                /* Depend on prior success packets started NAPI consumer via
+                 * __veth_xdp_flush(). Cancel TXQ stop if consumer stopped,
+                 * paired with empty check in veth_poll().
+                 */
+                netif_tx_stop_queue(txq);
+                smp_mb__after_atomic();
+                if (unlikely(__ptr_ring_empty(&rq->xdp_ring)))
+                        netif_tx_wake_queue(txq);
+                break;
+        case NET_RX_DROP: /* same as NET_XMIT_DROP */
 drop:
                 atomic64_inc(&priv->dropped);
                 ret = NET_XMIT_DROP;
+                break;
+        default:
+                net_crit_ratelimited("%s(%s): Invalid return code(%d)",
+                                     __func__, dev->name, ret);
         }
-
         rcu_read_unlock();
 
         return ret;
@@ -874,9 +900,17 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
                         struct veth_xdp_tx_bq *bq,
                         struct veth_stats *stats)
 {
+        struct veth_priv *priv = netdev_priv(rq->dev);
+        int queue_idx = rq->xdp_rxq.queue_index;
+        struct netdev_queue *peer_txq;
+        struct net_device *peer_dev;
         int i, done = 0, n_xdpf = 0;
         void *xdpf[VETH_XDP_BATCH];
 
+        /* NAPI functions as RCU section */
+        peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held());
+        peer_txq = netdev_get_tx_queue(peer_dev, queue_idx);
+
         for (i = 0; i < budget; i++) {
                 void *ptr = __ptr_ring_consume(&rq->xdp_ring);
 
@@ -925,6 +959,9 @@ static int veth_xdp_rcv(struct veth_rq *rq, int budget,
         rq->stats.vs.xdp_packets += done;
         u64_stats_update_end(&rq->stats.syncp);
 
+        if (unlikely(netif_tx_queue_stopped(peer_txq)))
+                netif_tx_wake_queue(peer_txq);
+
         return done;
 }
 
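The consumer side of the same handshake, again condensed from the hunks above rather than written fresh: veth_xdp_rcv() (called from veth_poll()) resolves the peer's txq once per poll, drains the ring, and wakes the txq if the producer stopped it, pairing with the netif_tx_stop_queue() plus smp_mb__after_atomic() sequence in veth_xmit(). The drain-loop body, including the break on an empty ring, is the pre-existing driver code and is elided here.

    /* NAPI runs as an RCU read-side section, so the peer lookup is safe. */
    peer_dev = rcu_dereference_check(priv->peer, rcu_read_lock_bh_held());
    peer_txq = netdev_get_tx_queue(peer_dev, queue_idx);

    for (i = 0; i < budget; i++) {
            void *ptr = __ptr_ring_consume(&rq->xdp_ring);

            if (!ptr)
                    break;
            /* ... pre-existing skb / XDP frame processing ... */
    }

    /* If the producer stopped the peer txq because the ring was full,
     * restart it now that entries have been drained, so the qdisc resumes
     * dequeueing packets toward this ring.
     */
    if (unlikely(netif_tx_queue_stopped(peer_txq)))
            netif_tx_wake_queue(peer_txq);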