
Commit 0ebc27a

Fernando Fernandez Mancera authored and kuba-moo committed
xsk: avoid data corruption on cq descriptor number
Since commit 30f241f ("xsk: Fix immature cq descriptor production"), the
descriptor number is stored in the skb control block and
xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto the
pool's completion queue. The skb control block shouldn't be used for this
purpose: after transmit, xsk no longer has control over it and other
subsystems could use it. This leads to the following kernel panic due to
a NULL pointer dereference.

BUG: kernel NULL pointer dereference, address: 0000000000000000
#PF: supervisor read access in kernel mode
#PF: error_code(0x0000) - not-present page
PGD 0 P4D 0
Oops: Oops: 0000 [#1] SMP NOPTI
CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy) Debian 6.16.12-1
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
RIP: 0010:xsk_destruct_skb+0xd0/0x180
[...]
Call Trace:
 <IRQ>
 ? napi_complete_done+0x7a/0x1a0
 ip_rcv_core+0x1bb/0x340
 ip_rcv+0x30/0x1f0
 __netif_receive_skb_one_core+0x85/0xa0
 process_backlog+0x87/0x130
 __napi_poll+0x28/0x180
 net_rx_action+0x339/0x420
 handle_softirqs+0xdc/0x320
 ? handle_edge_irq+0x90/0x1e0
 do_softirq.part.0+0x3b/0x60
 </IRQ>
 <TASK>
 __local_bh_enable_ip+0x60/0x70
 __dev_direct_xmit+0x14e/0x1f0
 __xsk_generic_xmit+0x482/0xb70
 ? __remove_hrtimer+0x41/0xa0
 ? __xsk_generic_xmit+0x51/0xb70
 ? _raw_spin_unlock_irqrestore+0xe/0x40
 xsk_sendmsg+0xda/0x1c0
 __sys_sendto+0x1ee/0x200
 __x64_sys_sendto+0x24/0x30
 do_syscall_64+0x84/0x2f0
 ? __pfx_pollwake+0x10/0x10
 ? __rseq_handle_notify_resume+0xad/0x4c0
 ? restore_fpregs_from_fpstate+0x3c/0x90
 ? switch_fpu_return+0x5b/0xe0
 ? do_syscall_64+0x204/0x2f0
 ? do_syscall_64+0x204/0x2f0
 ? do_syscall_64+0x204/0x2f0
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
 </TASK>
[...]
Kernel panic - not syncing: Fatal exception in interrupt
Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)

Instead, use the skb destructor_arg pointer along with pointer tagging.
As pointers are always aligned to 8B, use the bottom bit to indicate
whether this is a single address or an allocated struct containing
several addresses.

Fixes: 30f241f ("xsk: Fix immature cq descriptor production")
Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20251124171409.3845-1-fmancera@suse.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
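To make the tagging scheme above concrete, here is a small, self-contained userspace sketch of the idea, illustrative only: struct xsk_addrs_demo, the is_addr/set_addr/get_addr helpers, the destructor_arg global and the plain calloc()/assert() scaffolding are stand-ins for the kernel's struct xsk_addrs, the xsk_skb_destructor_*() helpers, skb_shinfo(skb)->destructor_arg and the kmem_cache, which appear in the diff below.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for the kernel's struct xsk_addrs. */
struct xsk_addrs_demo {
        uint32_t num_descs;
        uint64_t addrs[17];     /* MAX_SKB_FRAGS + 1 in the patch */
};

/* Stand-in for skb_shinfo(skb)->destructor_arg. */
static void *destructor_arg;

/* Bit 0 set => the value is a single umem address stored inline;
 * bit 0 clear => it is a pointer to an allocated address array
 * (heap/slab pointers are at least 8-byte aligned, so bit 0 is free).
 */
static int is_addr(void)
{
        return (uintptr_t)destructor_arg & 0x1UL;
}

static void set_addr(uint64_t addr)
{
        destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
}

static uint64_t get_addr(void)
{
        return (uint64_t)((uintptr_t)destructor_arg & ~0x1UL);
}

int main(void)
{
        /* Single-descriptor case: the address itself is stored, tagged. */
        set_addr(0x10000);
        assert(is_addr() && get_addr() == 0x10000);

        /* Multi-descriptor case: migrate to an allocated array and store
         * the (untagged) pointer instead.
         */
        struct xsk_addrs_demo *xa = calloc(1, sizeof(*xa));

        if (!xa)
                return 1;
        xa->addrs[xa->num_descs++] = get_addr();  /* carry over first addr */
        xa->addrs[xa->num_descs++] = 0x20000;     /* append a fragment addr */
        destructor_arg = xa;
        assert(!is_addr() && xa->num_descs == 2);

        printf("single addr and %u-entry array both round-trip\n",
               (unsigned)xa->num_descs);
        free(xa);
        return 0;
}

The discriminator works because a real pointer coming from the allocator always has bit 0 clear, so a value with bit 0 set can never be mistaken for a pointer to the address struct; the helpers added in the diff below encode exactly this invariant.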
1 parent ae1737e commit 0ebc27a

1 file changed: 88 additions, 55 deletions

net/xdp/xsk.c

Lines changed: 88 additions & 55 deletions
@@ -36,20 +36,13 @@
 #define TX_BATCH_SIZE 32
 #define MAX_PER_SOCKET_BUDGET 32
 
-struct xsk_addr_node {
-        u64 addr;
-        struct list_head addr_node;
-};
-
-struct xsk_addr_head {
+struct xsk_addrs {
         u32 num_descs;
-        struct list_head addrs_list;
+        u64 addrs[MAX_SKB_FRAGS + 1];
 };
 
 static struct kmem_cache *xsk_tx_generic_cache;
 
-#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
-
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
         if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -558,29 +551,68 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
         return ret;
 }
 
+static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
+{
+        return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
+}
+
+static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
+{
+        return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
+}
+
+static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+{
+        skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+}
+
+static void xsk_inc_num_desc(struct sk_buff *skb)
+{
+        struct xsk_addrs *xsk_addr;
+
+        if (!xsk_skb_destructor_is_addr(skb)) {
+                xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+                xsk_addr->num_descs++;
+        }
+}
+
+static u32 xsk_get_num_desc(struct sk_buff *skb)
+{
+        struct xsk_addrs *xsk_addr;
+
+        if (xsk_skb_destructor_is_addr(skb))
+                return 1;
+
+        xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+        return xsk_addr->num_descs;
+}
+
 static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
                                       struct sk_buff *skb)
 {
-        struct xsk_addr_node *pos, *tmp;
+        u32 num_descs = xsk_get_num_desc(skb);
+        struct xsk_addrs *xsk_addr;
         u32 descs_processed = 0;
         unsigned long flags;
-        u32 idx;
+        u32 idx, i;
 
         spin_lock_irqsave(&pool->cq_lock, flags);
         idx = xskq_get_prod(pool->cq);
 
-        xskq_prod_write_addr(pool->cq, idx,
-                             (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
-        descs_processed++;
+        if (unlikely(num_descs > 1)) {
+                xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
 
-        if (unlikely(XSKCB(skb)->num_descs > 1)) {
-                list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+                for (i = 0; i < num_descs; i++) {
                         xskq_prod_write_addr(pool->cq, idx + descs_processed,
-                                             pos->addr);
+                                             xsk_addr->addrs[i]);
                         descs_processed++;
-                        list_del(&pos->addr_node);
-                        kmem_cache_free(xsk_tx_generic_cache, pos);
                 }
+                kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
+        } else {
+                xskq_prod_write_addr(pool->cq, idx,
+                                     xsk_skb_destructor_get_addr(skb));
+                descs_processed++;
         }
         xskq_prod_submit_n(pool->cq, descs_processed);
         spin_unlock_irqrestore(&pool->cq_lock, flags);
@@ -595,16 +627,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
         spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
 
-static void xsk_inc_num_desc(struct sk_buff *skb)
-{
-        XSKCB(skb)->num_descs++;
-}
-
-static u32 xsk_get_num_desc(struct sk_buff *skb)
-{
-        return XSKCB(skb)->num_descs;
-}
-
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
         struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
@@ -621,27 +643,22 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
                               u64 addr)
 {
-        BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
-        INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
         skb->dev = xs->dev;
         skb->priority = READ_ONCE(xs->sk.sk_priority);
         skb->mark = READ_ONCE(xs->sk.sk_mark);
-        XSKCB(skb)->num_descs = 0;
         skb->destructor = xsk_destruct_skb;
-        skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
+        xsk_skb_destructor_set_addr(skb, addr);
 }
 
 static void xsk_consume_skb(struct sk_buff *skb)
 {
         struct xdp_sock *xs = xdp_sk(skb->sk);
         u32 num_descs = xsk_get_num_desc(skb);
-        struct xsk_addr_node *pos, *tmp;
+        struct xsk_addrs *xsk_addr;
 
         if (unlikely(num_descs > 1)) {
-                list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
-                        list_del(&pos->addr_node);
-                        kmem_cache_free(xsk_tx_generic_cache, pos);
-                }
+                xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+                kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
         }
 
         skb->destructor = sock_wfree;
@@ -701,7 +718,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 {
         struct xsk_buff_pool *pool = xs->pool;
         u32 hr, len, ts, offset, copy, copied;
-        struct xsk_addr_node *xsk_addr;
         struct sk_buff *skb = xs->skb;
         struct page *page;
         void *buffer;
@@ -727,16 +743,26 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
                         return ERR_PTR(err);
                 }
         } else {
-                xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
-                if (!xsk_addr)
-                        return ERR_PTR(-ENOMEM);
+                struct xsk_addrs *xsk_addr;
+
+                if (xsk_skb_destructor_is_addr(skb)) {
+                        xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
+                                                     GFP_KERNEL);
+                        if (!xsk_addr)
+                                return ERR_PTR(-ENOMEM);
+
+                        xsk_addr->num_descs = 1;
+                        xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
+                        skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+                } else {
+                        xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+                }
 
                 /* in case of -EOVERFLOW that could happen below,
                  * xsk_consume_skb() will release this node as whole skb
                  * would be dropped, which implies freeing all list elements
                  */
-                xsk_addr->addr = desc->addr;
-                list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+                xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
         }
 
         len = desc->len;
@@ -813,10 +839,25 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
                 }
         } else {
                 int nr_frags = skb_shinfo(skb)->nr_frags;
-                struct xsk_addr_node *xsk_addr;
+                struct xsk_addrs *xsk_addr;
                 struct page *page;
                 u8 *vaddr;
 
+                if (xsk_skb_destructor_is_addr(skb)) {
+                        xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
+                                                     GFP_KERNEL);
+                        if (!xsk_addr) {
+                                err = -ENOMEM;
+                                goto free_err;
+                        }
+
+                        xsk_addr->num_descs = 1;
+                        xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
+                        skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+                } else {
+                        xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+                }
+
                 if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
                         err = -EOVERFLOW;
                         goto free_err;
@@ -828,22 +869,14 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
                         goto free_err;
                 }
 
-                xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
-                if (!xsk_addr) {
-                        __free_page(page);
-                        err = -ENOMEM;
-                        goto free_err;
-                }
-
                 vaddr = kmap_local_page(page);
                 memcpy(vaddr, buffer, len);
                 kunmap_local(vaddr);
 
                 skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
                 refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
 
-                xsk_addr->addr = desc->addr;
-                list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+                xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
         }
         }
 
@@ -1904,7 +1937,7 @@ static int __init xsk_init(void)
                 goto out_pernet;
 
         xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
-                                                 sizeof(struct xsk_addr_node),
+                                                 sizeof(struct xsk_addrs),
                                                  0, SLAB_HWCACHE_ALIGN, NULL);
         if (!xsk_tx_generic_cache) {
                 err = -ENOMEM;
