@@ -381,6 +381,78 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 	return failure ? budget : (int)total_rx_packets;
 }
 
+static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
+			  unsigned int *total_bytes)
+{
+	struct i40e_tx_desc *tx_desc;
+	dma_addr_t dma;
+
+	dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr);
+	xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len);
+
+	tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
+	tx_desc->buffer_addr = cpu_to_le64(dma);
+	tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP,
+						  0, desc->len, 0);
+
+	*total_bytes += desc->len;
+}
+
+/* This value should match the pragma below. Why 4? It is strictly
+ * empirical. It seems to be a good compromise between the advantage
+ * of having simultaneous outstanding reads to the DMA array that can
+ * hide each other's latency and the disadvantage of having a larger
+ * code path.
+ */
+#define PKTS_PER_BATCH 4
+
+static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
+				unsigned int *total_bytes)
+{
+	u16 ntu = xdp_ring->next_to_use;
+	struct i40e_tx_desc *tx_desc;
+	dma_addr_t dma;
+	u32 i;
+
+#pragma GCC unroll 4
+	for (i = 0; i < PKTS_PER_BATCH; i++) {
+		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
+		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
+
+		tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
+		tx_desc->buffer_addr = cpu_to_le64(dma);
+		tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
+							  I40E_TX_DESC_CMD_EOP,
+							  0, desc[i].len, 0);
+
+		*total_bytes += desc[i].len;
+	}
+
+	xdp_ring->next_to_use = ntu;
+}
+
+static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts,
+				 unsigned int *total_bytes)
+{
+	u32 batched, leftover, i;
+
+	batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
+	leftover = nb_pkts & (PKTS_PER_BATCH - 1);
+	for (i = 0; i < batched; i += PKTS_PER_BATCH)
+		i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes);
+	for (i = batched; i < batched + leftover; i++)
+		i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes);
+}
+
+static void i40e_set_rs_bit(struct i40e_ring *xdp_ring)
+{
+	u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1;
+	struct i40e_tx_desc *tx_desc;
+
+	tx_desc = I40E_TX_DESC(xdp_ring, ntu);
+	tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT);
+}
+
 /**
  * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
  * @xdp_ring: XDP Tx ring
@@ -390,45 +462,30 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
  **/
 static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
 {
-	unsigned int sent_frames = 0, total_bytes = 0;
-	struct i40e_tx_desc *tx_desc = NULL;
-	struct xdp_desc desc;
-	dma_addr_t dma;
-
-	while (budget-- > 0) {
-		if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc))
-			break;
-
-		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr);
-		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma,
-						 desc.len);
-
-		tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
-		tx_desc->buffer_addr = cpu_to_le64(dma);
-		tx_desc->cmd_type_offset_bsz =
-			build_ctob(I40E_TX_DESC_CMD_ICRC
-				   | I40E_TX_DESC_CMD_EOP,
-				   0, desc.len, 0);
-
-		sent_frames++;
-		total_bytes += desc.len;
-
-		xdp_ring->next_to_use++;
-		if (xdp_ring->next_to_use == xdp_ring->count)
-			xdp_ring->next_to_use = 0;
+	struct xdp_desc *descs = xdp_ring->xsk_descs;
+	u32 nb_pkts, nb_processed = 0;
+	unsigned int total_bytes = 0;
+
+	nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget);
+	if (!nb_pkts)
+		return false;
+
+	if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) {
+		nb_processed = xdp_ring->count - xdp_ring->next_to_use;
+		i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes);
+		xdp_ring->next_to_use = 0;
 	}
 
-	if (tx_desc) {
-		/* Request an interrupt for the last frame and bump tail ptr. */
-		tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS <<
-						 I40E_TXD_QW1_CMD_SHIFT);
-		i40e_xdp_ring_update_tail(xdp_ring);
+	i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed,
+			     &total_bytes);
 
-		xsk_tx_release(xdp_ring->xsk_pool);
-		i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes);
-	}
+	/* Request an interrupt for the last frame and bump tail ptr. */
+	i40e_set_rs_bit(xdp_ring);
+	i40e_xdp_ring_update_tail(xdp_ring);
+
+	i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes);
 
-	return !!budget;
+	return true;
 }
 
 /**
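
Note on i40e_fill_tx_hw_ring(): the batched/leftover split relies on PKTS_PER_BATCH being a power of two, so nb_pkts & ~(PKTS_PER_BATCH - 1) rounds down to the nearest multiple of the batch size and nb_pkts & (PKTS_PER_BATCH - 1) gives the remainder handled one descriptor at a time. The user-space sketch below is not part of the patch; it only demonstrates the arithmetic with the same macro value.

/* Standalone illustration of the batch split used by i40e_fill_tx_hw_ring().
 * PKTS_PER_BATCH must be a power of two for the mask arithmetic to hold.
 */
#include <assert.h>
#include <stdio.h>

#define PKTS_PER_BATCH 4

int main(void)
{
	unsigned int nb_pkts;

	for (nb_pkts = 0; nb_pkts <= 10; nb_pkts++) {
		/* Largest multiple of PKTS_PER_BATCH not exceeding nb_pkts. */
		unsigned int batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
		/* Remainder sent via the single-packet path. */
		unsigned int leftover = nb_pkts & (PKTS_PER_BATCH - 1);

		assert(batched + leftover == nb_pkts);
		printf("nb_pkts=%2u -> batched=%2u leftover=%u\n",
		       nb_pkts, batched, leftover);
	}
	return 0;
}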
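
Note on the reworked i40e_xmit_zc(): instead of checking for ring wrap-around on every descriptor, the batch is split at most once per poll. Descriptors up to the end of the ring are written first, next_to_use is reset to 0, and the remainder follows. The sketch below is a minimal user-space illustration of that split; the ring values are made up and fill() is a stub standing in for i40e_fill_tx_hw_ring().

/* Standalone illustration of the single wrap-around split in i40e_xmit_zc(). */
#include <stdio.h>

static void fill(unsigned int start, unsigned int n)
{
	printf("fill %u descriptors starting at index %u\n", n, start);
}

int main(void)
{
	unsigned int count = 512;        /* ring size (example value) */
	unsigned int next_to_use = 508;  /* current producer index (example value) */
	unsigned int nb_pkts = 10;       /* descriptors returned by the batched peek */
	unsigned int nb_processed = 0;

	if (next_to_use + nb_pkts >= count) {
		nb_processed = count - next_to_use;  /* slots left before the wrap */
		fill(next_to_use, nb_processed);
		next_to_use = 0;                     /* wrap exactly once */
	}
	fill(next_to_use, nb_pkts - nb_processed);   /* remaining descriptors */
	return 0;
}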