
Commit 8df8e06

magnus-karlsson authored and kernel-patches-bot committed
i40e: use batched xsk Tx interfaces to increase performance
Use the new batched xsk interfaces for the Tx path in the i40e driver to improve performance. On my machine, this yields a throughput increase of 4% for the l2fwd sample app in xdpsock. Looking at the Tx part in isolation, this patch set increases Tx throughput by more than 20%.

Note that I had to explicitly unroll the inner loop with a pragma to reach this performance level. The pragma is honored by both clang and gcc and should be ignored by compiler versions that do not support it. Using the -funroll-loops command line switch on the source file instead resulted in unrolling at a higher level, which led to a performance decrease rather than an increase.

Signed-off-by: Magnus Karlsson <magnus.karlsson@intel.com>
Acked-by: John Fastabend <john.fastabend@gmail.com>
1 parent f2586be commit 8df8e06
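As a rough illustration of the fixed-count unrolling described in the commit message (not part of the patch; the function, macro, and values below are made up), a "#pragma GCC unroll" on a small loop looks like this. Per the commit message, the pragma is honored by both gcc and clang and ignored by compilers that do not know it:

/* Hypothetical standalone example, not part of the patch: a fixed-count
 * unroll requested with "#pragma GCC unroll". BATCH and the values in
 * main() are made-up illustration values.
 */
#include <stdio.h>

#define BATCH 4 /* must match the count in the pragma below */

static unsigned int sum_batch(const unsigned int *len)
{
        unsigned int total = 0;
        unsigned int i;

#pragma GCC unroll 4
        for (i = 0; i < BATCH; i++)
                total += len[i];        /* compiler emits four copies of this body */

        return total;
}

int main(void)
{
        unsigned int lens[BATCH] = { 64, 128, 256, 512 };

        printf("total bytes: %u\n", sum_batch(lens));   /* prints 960 */
        return 0;
}

The pragma duplicates the loop body a fixed number of times, which is what lets the four DMA-address lookups in i40e_xmit_pkt_batch below be issued back to back.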

3 files changed: 104 additions, 35 deletions


drivers/net/ethernet/intel/i40e/i40e_txrx.c

Lines changed: 11 additions & 0 deletions
@@ -676,6 +676,8 @@ void i40e_free_tx_resources(struct i40e_ring *tx_ring)
 	i40e_clean_tx_ring(tx_ring);
 	kfree(tx_ring->tx_bi);
 	tx_ring->tx_bi = NULL;
+	kfree(tx_ring->xsk_descs);
+	tx_ring->xsk_descs = NULL;
 
 	if (tx_ring->desc) {
 		dma_free_coherent(tx_ring->dev, tx_ring->size,
@@ -1277,6 +1279,13 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
 	if (!tx_ring->tx_bi)
 		goto err;
 
+	if (ring_is_xdp(tx_ring)) {
+		tx_ring->xsk_descs = kcalloc(I40E_MAX_NUM_DESCRIPTORS, sizeof(*tx_ring->xsk_descs),
+					     GFP_KERNEL);
+		if (!tx_ring->xsk_descs)
+			goto err;
+	}
+
 	u64_stats_init(&tx_ring->syncp);
 
 	/* round up to nearest 4K */
@@ -1300,6 +1309,8 @@ int i40e_setup_tx_descriptors(struct i40e_ring *tx_ring)
 	return 0;
 
 err:
+	kfree(tx_ring->xsk_descs);
+	tx_ring->xsk_descs = NULL;
 	kfree(tx_ring->tx_bi);
 	tx_ring->tx_bi = NULL;
 	return -ENOMEM;

drivers/net/ethernet/intel/i40e/i40e_txrx.h

Lines changed: 1 addition & 0 deletions
@@ -389,6 +389,7 @@ struct i40e_ring {
 	struct i40e_channel *ch;
 	struct xdp_rxq_info xdp_rxq;
 	struct xsk_buff_pool *xsk_pool;
+	struct xdp_desc *xsk_descs;      /* For storing descriptors in the AF_XDP ZC path */
 } ____cacheline_internodealigned_in_smp;
 
 static inline bool ring_uses_build_skb(struct i40e_ring *ring)

drivers/net/ethernet/intel/i40e/i40e_xsk.c

Lines changed: 92 additions & 35 deletions
@@ -381,6 +381,78 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
 	return failure ? budget : (int)total_rx_packets;
 }
 
+static void i40e_xmit_pkt(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
+			  unsigned int *total_bytes)
+{
+	struct i40e_tx_desc *tx_desc;
+	dma_addr_t dma;
+
+	dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr);
+	xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len);
+
+	tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use++);
+	tx_desc->buffer_addr = cpu_to_le64(dma);
+	tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC | I40E_TX_DESC_CMD_EOP,
+						  0, desc->len, 0);
+
+	*total_bytes += desc->len;
+}
+
+/* This value should match the pragma below. Why 4? It is strictly
+ * empirical. It seems to be a good compromise between the advantage
+ * of having simultaneous outstanding reads to the DMA array that can
+ * hide each others latency and the disadvantage of having a larger
+ * code path.
+ */
+#define PKTS_PER_BATCH 4
+
+static void i40e_xmit_pkt_batch(struct i40e_ring *xdp_ring, struct xdp_desc *desc,
+				unsigned int *total_bytes)
+{
+	u16 ntu = xdp_ring->next_to_use;
+	struct i40e_tx_desc *tx_desc;
+	dma_addr_t dma;
+	u32 i;
+
+#pragma GCC unroll 4
+	for (i = 0; i < PKTS_PER_BATCH; i++) {
+		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc[i].addr);
+		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc[i].len);
+
+		tx_desc = I40E_TX_DESC(xdp_ring, ntu++);
+		tx_desc->buffer_addr = cpu_to_le64(dma);
+		tx_desc->cmd_type_offset_bsz = build_ctob(I40E_TX_DESC_CMD_ICRC |
+							  I40E_TX_DESC_CMD_EOP,
+							  0, desc[i].len, 0);
+
+		*total_bytes += desc[i].len;
+	}
+
+	xdp_ring->next_to_use = ntu;
+}
+
+static void i40e_fill_tx_hw_ring(struct i40e_ring *xdp_ring, struct xdp_desc *descs, u32 nb_pkts,
+				 unsigned int *total_bytes)
+{
+	u32 batched, leftover, i;
+
+	batched = nb_pkts & ~(PKTS_PER_BATCH - 1);
+	leftover = nb_pkts & (PKTS_PER_BATCH - 1);
+	for (i = 0; i < batched; i += PKTS_PER_BATCH)
+		i40e_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes);
+	for (i = batched; i < batched + leftover; i++)
+		i40e_xmit_pkt(xdp_ring, &descs[i], total_bytes);
+}
+
+static void i40e_set_rs_bit(struct i40e_ring *xdp_ring)
+{
+	u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : xdp_ring->count - 1;
+	struct i40e_tx_desc *tx_desc;
+
+	tx_desc = I40E_TX_DESC(xdp_ring, ntu);
+	tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS << I40E_TXD_QW1_CMD_SHIFT);
+}
+
 /**
  * i40e_xmit_zc - Performs zero-copy Tx AF_XDP
  * @xdp_ring: XDP Tx ring
@@ -390,45 +462,30 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
  **/
 static bool i40e_xmit_zc(struct i40e_ring *xdp_ring, unsigned int budget)
 {
-	unsigned int sent_frames = 0, total_bytes = 0;
-	struct i40e_tx_desc *tx_desc = NULL;
-	struct xdp_desc desc;
-	dma_addr_t dma;
-
-	while (budget-- > 0) {
-		if (!xsk_tx_peek_desc(xdp_ring->xsk_pool, &desc))
-			break;
-
-		dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc.addr);
-		xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma,
-						 desc.len);
-
-		tx_desc = I40E_TX_DESC(xdp_ring, xdp_ring->next_to_use);
-		tx_desc->buffer_addr = cpu_to_le64(dma);
-		tx_desc->cmd_type_offset_bsz =
-			build_ctob(I40E_TX_DESC_CMD_ICRC
-				   | I40E_TX_DESC_CMD_EOP,
-				   0, desc.len, 0);
-
-		sent_frames++;
-		total_bytes += desc.len;
-
-		xdp_ring->next_to_use++;
-		if (xdp_ring->next_to_use == xdp_ring->count)
-			xdp_ring->next_to_use = 0;
+	struct xdp_desc *descs = xdp_ring->xsk_descs;
+	u32 nb_pkts, nb_processed = 0;
+	unsigned int total_bytes = 0;
+
+	nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, descs, budget);
+	if (!nb_pkts)
+		return false;
+
+	if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) {
+		nb_processed = xdp_ring->count - xdp_ring->next_to_use;
+		i40e_fill_tx_hw_ring(xdp_ring, descs, nb_processed, &total_bytes);
+		xdp_ring->next_to_use = 0;
 	}
 
-	if (tx_desc) {
-		/* Request an interrupt for the last frame and bump tail ptr. */
-		tx_desc->cmd_type_offset_bsz |= (I40E_TX_DESC_CMD_RS <<
-						 I40E_TXD_QW1_CMD_SHIFT);
-		i40e_xdp_ring_update_tail(xdp_ring);
+	i40e_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], nb_pkts - nb_processed,
+			     &total_bytes);
 
-		xsk_tx_release(xdp_ring->xsk_pool);
-		i40e_update_tx_stats(xdp_ring, sent_frames, total_bytes);
-	}
+	/* Request an interrupt for the last frame and bump tail ptr. */
+	i40e_set_rs_bit(xdp_ring);
+	i40e_xdp_ring_update_tail(xdp_ring);
+
+	i40e_update_tx_stats(xdp_ring, nb_pkts, total_bytes);
 
-	return !!budget;
+	return true;
 }
 
 /**
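For readers skimming the new Tx path above, here is a minimal standalone sketch (illustrative only; all names, the ring size, and the packet counts are made up, and the real code writes hardware Tx descriptors rather than printing) of the two splits the new i40e_xmit_zc performs: first splitting the descriptor batch at the end of the ring so next_to_use can wrap back to 0, then splitting each chunk into full PKTS_PER_BATCH groups plus a small leftover handled one packet at a time:

/* Hypothetical standalone sketch of the two splits in the new Tx path. */
#include <stdio.h>

#define BATCH 4 /* stands in for PKTS_PER_BATCH; must be a power of two */

static void xmit_one(unsigned int idx)
{
        printf("single packet %u\n", idx);
}

static void xmit_batch(unsigned int idx)
{
        printf("batch of %d starting at %u\n", BATCH, idx);
}

/* Batch split: whole BATCH-sized groups first, then the few leftovers. */
static void fill_ring(unsigned int first, unsigned int n)
{
        unsigned int batched = n & ~(BATCH - 1);        /* round down to a multiple of BATCH */
        unsigned int leftover = n & (BATCH - 1);        /* remainder, 0..BATCH-1 */
        unsigned int i;

        for (i = 0; i < batched; i += BATCH)
                xmit_batch(first + i);
        for (i = batched; i < batched + leftover; i++)
                xmit_one(first + i);
}

int main(void)
{
        unsigned int ring_size = 16, next_to_use = 13, nb_pkts = 7;
        unsigned int nb_processed = 0;

        /* Ring-wrap split: if the batch would run past the end of the ring,
         * fill up to the end first and wrap next_to_use back to 0.
         */
        if (next_to_use + nb_pkts >= ring_size) {
                nb_processed = ring_size - next_to_use;
                fill_ring(0, nb_processed);     /* 3 packets up to the ring end */
                next_to_use = 0;
        }
        fill_ring(nb_processed, nb_pkts - nb_processed);        /* remaining 4 packets */

        return 0;
}

Because PKTS_PER_BATCH is a power of two, the batched/leftover split reduces to two mask operations rather than a division.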
