
Commit c5ea2e5

Fernando Fernandez Mancera authored and gregkh committed
xsk: avoid data corruption on cq descriptor number
[ Upstream commit 0ebc27a ]

Since commit 30f241f ("xsk: Fix immature cq descriptor production"), the
descriptor number is stored in the skb control block and
xsk_cq_submit_addr_locked() relies on it to put the umem addrs onto the
pool's completion queue. The skb control block shouldn't be used for this
purpose because, after transmit, xsk no longer has control over it and
other subsystems could use it. This leads to the following kernel panic
due to a NULL pointer dereference.

 BUG: kernel NULL pointer dereference, address: 0000000000000000
 #PF: supervisor read access in kernel mode
 #PF: error_code(0x0000) - not-present page
 PGD 0 P4D 0
 Oops: Oops: 0000 [#1] SMP NOPTI
 CPU: 2 UID: 1 PID: 927 Comm: p4xsk.bin Not tainted 6.16.12+deb14-cloud-amd64 #1 PREEMPT(lazy) Debian 6.16.12-1
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.17.0-debian-1.17.0-1 04/01/2014
 RIP: 0010:xsk_destruct_skb+0xd0/0x180
 [...]
 Call Trace:
 <IRQ>
 ? napi_complete_done+0x7a/0x1a0
 ip_rcv_core+0x1bb/0x340
 ip_rcv+0x30/0x1f0
 __netif_receive_skb_one_core+0x85/0xa0
 process_backlog+0x87/0x130
 __napi_poll+0x28/0x180
 net_rx_action+0x339/0x420
 handle_softirqs+0xdc/0x320
 ? handle_edge_irq+0x90/0x1e0
 do_softirq.part.0+0x3b/0x60
 </IRQ>
 <TASK>
 __local_bh_enable_ip+0x60/0x70
 __dev_direct_xmit+0x14e/0x1f0
 __xsk_generic_xmit+0x482/0xb70
 ? __remove_hrtimer+0x41/0xa0
 ? __xsk_generic_xmit+0x51/0xb70
 ? _raw_spin_unlock_irqrestore+0xe/0x40
 xsk_sendmsg+0xda/0x1c0
 __sys_sendto+0x1ee/0x200
 __x64_sys_sendto+0x24/0x30
 do_syscall_64+0x84/0x2f0
 ? __pfx_pollwake+0x10/0x10
 ? __rseq_handle_notify_resume+0xad/0x4c0
 ? restore_fpregs_from_fpstate+0x3c/0x90
 ? switch_fpu_return+0x5b/0xe0
 ? do_syscall_64+0x204/0x2f0
 ? do_syscall_64+0x204/0x2f0
 ? do_syscall_64+0x204/0x2f0
 entry_SYSCALL_64_after_hwframe+0x76/0x7e
 </TASK>
 [...]
 Kernel panic - not syncing: Fatal exception in interrupt
 Kernel Offset: 0x1c000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff)

Instead, use the skb destructor_arg pointer along with pointer tagging.
As pointers are always aligned to 8B, use the bottom bit to indicate
whether this is a single address or an allocated struct containing
several addresses.

Fixes: 30f241f ("xsk: Fix immature cq descriptor production")
Closes: https://lore.kernel.org/netdev/0435b904-f44f-48f8-afb0-68868474bf1c@nop.hu/
Suggested-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Fernando Fernandez Mancera <fmancera@suse.de>
Reviewed-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
Link: https://patch.msgid.link/20251124171409.3845-1-fmancera@suse.de
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
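A minimal userspace sketch of the pointer-tagging idea described above (hypothetical names, not the helpers added by this patch): because allocated pointers are at least 8-byte aligned, bit 0 is normally clear, so setting it can mark the slot as carrying a single address value rather than a pointer to an allocated block of addresses.

/*
 * Illustrative only; mirrors the tagging scheme, not the kernel code.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct addrs {
	uint32_t num;		/* how many addresses are stored */
	uint64_t addr[8];	/* the stored addresses */
};

static void *tag_single(uint64_t addr)
{
	return (void *)((uintptr_t)addr | 0x1UL);	/* set the tag bit */
}

static int is_single(void *slot)
{
	return (uintptr_t)slot & 0x1UL;			/* tag bit set? */
}

static uint64_t untag_single(void *slot)
{
	return (uint64_t)((uintptr_t)slot & ~0x1UL);	/* clear the tag bit */
}

int main(void)
{
	/* one descriptor: store the address inline, tagged */
	void *slot = tag_single(0x2000);
	printf("single=%d addr=%#llx\n", is_single(slot),
	       (unsigned long long)untag_single(slot));

	/* several descriptors: store an untagged pointer to a struct */
	struct addrs *a = calloc(1, sizeof(*a));
	if (!a)
		return 1;
	a->addr[a->num++] = 0x2000;
	a->addr[a->num++] = 0x3000;
	slot = a;				/* aligned pointer => bit 0 clear */
	printf("single=%d count=%u\n", is_single(slot), a->num);

	free(a);
	return 0;
}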
1 parent 49d2cea commit c5ea2e5

File tree

1 file changed: +88 −55 lines changed


net/xdp/xsk.c

Lines changed: 88 additions & 55 deletions
@@ -36,20 +36,13 @@
 #define TX_BATCH_SIZE 32
 #define MAX_PER_SOCKET_BUDGET 32
 
-struct xsk_addr_node {
-	u64 addr;
-	struct list_head addr_node;
-};
-
-struct xsk_addr_head {
+struct xsk_addrs {
 	u32 num_descs;
-	struct list_head addrs_list;
+	u64 addrs[MAX_SKB_FRAGS + 1];
 };
 
 static struct kmem_cache *xsk_tx_generic_cache;
 
-#define XSKCB(skb) ((struct xsk_addr_head *)((skb)->cb))
-
 void xsk_set_rx_need_wakeup(struct xsk_buff_pool *pool)
 {
 	if (pool->cached_need_wakeup & XDP_WAKEUP_RX)
@@ -558,29 +551,68 @@ static int xsk_cq_reserve_locked(struct xsk_buff_pool *pool)
 	return ret;
 }
 
+static bool xsk_skb_destructor_is_addr(struct sk_buff *skb)
+{
+	return (uintptr_t)skb_shinfo(skb)->destructor_arg & 0x1UL;
+}
+
+static u64 xsk_skb_destructor_get_addr(struct sk_buff *skb)
+{
+	return (u64)((uintptr_t)skb_shinfo(skb)->destructor_arg & ~0x1UL);
+}
+
+static void xsk_skb_destructor_set_addr(struct sk_buff *skb, u64 addr)
+{
+	skb_shinfo(skb)->destructor_arg = (void *)((uintptr_t)addr | 0x1UL);
+}
+
+static void xsk_inc_num_desc(struct sk_buff *skb)
+{
+	struct xsk_addrs *xsk_addr;
+
+	if (!xsk_skb_destructor_is_addr(skb)) {
+		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+		xsk_addr->num_descs++;
+	}
+}
+
+static u32 xsk_get_num_desc(struct sk_buff *skb)
+{
+	struct xsk_addrs *xsk_addr;
+
+	if (xsk_skb_destructor_is_addr(skb))
+		return 1;
+
+	xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+
+	return xsk_addr->num_descs;
+}
+
 static void xsk_cq_submit_addr_locked(struct xsk_buff_pool *pool,
 				      struct sk_buff *skb)
 {
-	struct xsk_addr_node *pos, *tmp;
+	u32 num_descs = xsk_get_num_desc(skb);
+	struct xsk_addrs *xsk_addr;
 	u32 descs_processed = 0;
 	unsigned long flags;
-	u32 idx;
+	u32 idx, i;
 
 	spin_lock_irqsave(&pool->cq_lock, flags);
 	idx = xskq_get_prod(pool->cq);
 
-	xskq_prod_write_addr(pool->cq, idx,
-			     (u64)(uintptr_t)skb_shinfo(skb)->destructor_arg);
-	descs_processed++;
+	if (unlikely(num_descs > 1)) {
+		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
 
-	if (unlikely(XSKCB(skb)->num_descs > 1)) {
-		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
+		for (i = 0; i < num_descs; i++) {
 			xskq_prod_write_addr(pool->cq, idx + descs_processed,
-					     pos->addr);
+					     xsk_addr->addrs[i]);
 			descs_processed++;
-			list_del(&pos->addr_node);
-			kmem_cache_free(xsk_tx_generic_cache, pos);
 		}
+		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
+	} else {
+		xskq_prod_write_addr(pool->cq, idx,
+				     xsk_skb_destructor_get_addr(skb));
+		descs_processed++;
 	}
 	xskq_prod_submit_n(pool->cq, descs_processed);
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
@@ -595,16 +627,6 @@ static void xsk_cq_cancel_locked(struct xsk_buff_pool *pool, u32 n)
 	spin_unlock_irqrestore(&pool->cq_lock, flags);
 }
 
-static void xsk_inc_num_desc(struct sk_buff *skb)
-{
-	XSKCB(skb)->num_descs++;
-}
-
-static u32 xsk_get_num_desc(struct sk_buff *skb)
-{
-	return XSKCB(skb)->num_descs;
-}
-
 static void xsk_destruct_skb(struct sk_buff *skb)
 {
 	struct xsk_tx_metadata_compl *compl = &skb_shinfo(skb)->xsk_meta;
@@ -621,27 +643,22 @@ static void xsk_destruct_skb(struct sk_buff *skb)
 static void xsk_skb_init_misc(struct sk_buff *skb, struct xdp_sock *xs,
 			      u64 addr)
 {
-	BUILD_BUG_ON(sizeof(struct xsk_addr_head) > sizeof(skb->cb));
-	INIT_LIST_HEAD(&XSKCB(skb)->addrs_list);
 	skb->dev = xs->dev;
 	skb->priority = READ_ONCE(xs->sk.sk_priority);
 	skb->mark = READ_ONCE(xs->sk.sk_mark);
-	XSKCB(skb)->num_descs = 0;
 	skb->destructor = xsk_destruct_skb;
-	skb_shinfo(skb)->destructor_arg = (void *)(uintptr_t)addr;
+	xsk_skb_destructor_set_addr(skb, addr);
 }
 
 static void xsk_consume_skb(struct sk_buff *skb)
 {
 	struct xdp_sock *xs = xdp_sk(skb->sk);
 	u32 num_descs = xsk_get_num_desc(skb);
-	struct xsk_addr_node *pos, *tmp;
+	struct xsk_addrs *xsk_addr;
 
 	if (unlikely(num_descs > 1)) {
-		list_for_each_entry_safe(pos, tmp, &XSKCB(skb)->addrs_list, addr_node) {
-			list_del(&pos->addr_node);
-			kmem_cache_free(xsk_tx_generic_cache, pos);
-		}
+		xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+		kmem_cache_free(xsk_tx_generic_cache, xsk_addr);
 	}
 
 	skb->destructor = sock_wfree;
@@ -662,7 +679,6 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 {
 	struct xsk_buff_pool *pool = xs->pool;
 	u32 hr, len, ts, offset, copy, copied;
-	struct xsk_addr_node *xsk_addr;
 	struct sk_buff *skb = xs->skb;
 	struct page *page;
 	void *buffer;
@@ -680,16 +696,26 @@ static struct sk_buff *xsk_build_skb_zerocopy(struct xdp_sock *xs,
 
 		xsk_skb_init_misc(skb, xs, desc->addr);
 	} else {
-		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
-		if (!xsk_addr)
-			return ERR_PTR(-ENOMEM);
+		struct xsk_addrs *xsk_addr;
+
+		if (xsk_skb_destructor_is_addr(skb)) {
+			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
+						     GFP_KERNEL);
+			if (!xsk_addr)
+				return ERR_PTR(-ENOMEM);
+
+			xsk_addr->num_descs = 1;
+			xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
+			skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+		} else {
+			xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+		}
 
 		/* in case of -EOVERFLOW that could happen below,
 		 * xsk_consume_skb() will release this node as whole skb
 		 * would be dropped, which implies freeing all list elements
 		 */
-		xsk_addr->addr = desc->addr;
-		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+		xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
 	}
 
 	addr = desc->addr;
@@ -765,10 +791,25 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 		xsk_skb_init_misc(skb, xs, desc->addr);
 	} else {
 		int nr_frags = skb_shinfo(skb)->nr_frags;
-		struct xsk_addr_node *xsk_addr;
+		struct xsk_addrs *xsk_addr;
 		struct page *page;
 		u8 *vaddr;
 
+		if (xsk_skb_destructor_is_addr(skb)) {
+			xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache,
+						     GFP_KERNEL);
+			if (!xsk_addr) {
+				err = -ENOMEM;
+				goto free_err;
+			}
+
+			xsk_addr->num_descs = 1;
+			xsk_addr->addrs[0] = xsk_skb_destructor_get_addr(skb);
+			skb_shinfo(skb)->destructor_arg = (void *)xsk_addr;
+		} else {
+			xsk_addr = (struct xsk_addrs *)skb_shinfo(skb)->destructor_arg;
+		}
+
 		if (unlikely(nr_frags == (MAX_SKB_FRAGS - 1) && xp_mb_desc(desc))) {
 			err = -EOVERFLOW;
 			goto free_err;
@@ -780,22 +821,14 @@ static struct sk_buff *xsk_build_skb(struct xdp_sock *xs,
 			goto free_err;
 		}
 
-		xsk_addr = kmem_cache_zalloc(xsk_tx_generic_cache, GFP_KERNEL);
-		if (!xsk_addr) {
-			__free_page(page);
-			err = -ENOMEM;
-			goto free_err;
-		}
-
 		vaddr = kmap_local_page(page);
 		memcpy(vaddr, buffer, len);
 		kunmap_local(vaddr);
 
 		skb_add_rx_frag(skb, nr_frags, page, 0, len, PAGE_SIZE);
 		refcount_add(PAGE_SIZE, &xs->sk.sk_wmem_alloc);
 
-		xsk_addr->addr = desc->addr;
-		list_add_tail(&xsk_addr->addr_node, &XSKCB(skb)->addrs_list);
+		xsk_addr->addrs[xsk_addr->num_descs] = desc->addr;
 	}
 
 	if (first_frag && desc->options & XDP_TX_METADATA) {
@@ -1892,7 +1925,7 @@ static int __init xsk_init(void)
 		goto out_pernet;
 
 	xsk_tx_generic_cache = kmem_cache_create("xsk_generic_xmit_cache",
-						 sizeof(struct xsk_addr_node),
+						 sizeof(struct xsk_addrs),
 						 0, SLAB_HWCACHE_ALIGN, NULL);
 	if (!xsk_tx_generic_cache) {
 		err = -ENOMEM;
