
Commit 383d9f8

Merge branch 'net-core-use-a-dedicated-kmem_cache-for-skb-head-allocs'
Eric Dumazet says:

====================
net: core: use a dedicated kmem_cache for skb head allocs

Our profile data show that using kmalloc(non_const_size)/kfree(ptr) has a
certain cost, because kfree(ptr) has to pull a 'struct page' into cpu caches.

Using a dedicated kmem_cache for TCP skb->head allocations makes a
difference, both in cpu cycles and in memory savings.

This kmem_cache could also be used for GRO skb allocations; that is left as
a future exercise.
====================

Link: https://lore.kernel.org/r/20230206173103.2617121-1-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2 parents 61d731e + bf9f1ba commit 383d9f8
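
The core idea of the series, reduced to the generic slab pattern it builds
on: a minimal sketch, assuming a made-up cache name and object size (the
real cache, skb_small_head_cache, is created in skb_init() later in this
diff):

    #include <linux/init.h>
    #include <linux/slab.h>

    /* Sketch only: name and object size are illustrative assumptions. */
    static struct kmem_cache *example_head_cache __ro_after_init;

    void __init example_head_cache_init(void)
    {
            example_head_cache = kmem_cache_create("example_head",
                                                   1024, /* fixed size */
                                                   0,
                                                   SLAB_HWCACHE_ALIGN,
                                                   NULL);
    }

    void *example_head_alloc(gfp_t gfp, int node)
    {
            /* The cache (hence the object size) is known at the call
             * site, so the free path does not have to derive it from
             * the backing 'struct page' the way
             * kfree(kmalloc(non_const_size)) must.
             */
            return kmem_cache_alloc_node(example_head_cache, gfp, node);
    }

    void example_head_free(void *ptr)
    {
            kmem_cache_free(example_head_cache, ptr);
    }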

2 files changed: 90 additions (+90), 33 deletions (−33)

include/linux/skbuff.h

Lines changed: 8 additions & 0 deletions
@@ -255,6 +255,14 @@
 #define SKB_DATA_ALIGN(X)	ALIGN(X, SMP_CACHE_BYTES)
 #define SKB_WITH_OVERHEAD(X)	\
 	((X) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
+/* For X bytes available in skb->head, what is the minimal
+ * allocation needed, knowing struct skb_shared_info needs
+ * to be aligned.
+ */
+#define SKB_HEAD_ALIGN(X) (SKB_DATA_ALIGN(X) + \
+			   SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))
+
 #define SKB_MAX_ORDER(X, ORDER) \
 	SKB_WITH_OVERHEAD((PAGE_SIZE << (ORDER)) - (X))
 #define SKB_MAX_HEAD(X)	(SKB_MAX_ORDER((X), 0))

net/core/skbuff.c

Lines changed: 82 additions & 33 deletions
@@ -89,6 +89,34 @@ static struct kmem_cache *skbuff_fclone_cache __ro_after_init;
 #ifdef CONFIG_SKB_EXTENSIONS
 static struct kmem_cache *skbuff_ext_cache __ro_after_init;
 #endif
+
+/* skb_small_head_cache and related code is only supported
+ * for CONFIG_SLAB and CONFIG_SLUB.
+ * As soon as SLOB is removed from the kernel, we can clean up this.
+ */
+#if !defined(CONFIG_SLOB)
+# define HAVE_SKB_SMALL_HEAD_CACHE 1
+#endif
+
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+static struct kmem_cache *skb_small_head_cache __ro_after_init;
+
+#define SKB_SMALL_HEAD_SIZE SKB_HEAD_ALIGN(MAX_TCP_HEADER)
+
+/* We want SKB_SMALL_HEAD_CACHE_SIZE to not be a power of two.
+ * This should ensure that SKB_SMALL_HEAD_HEADROOM is a unique
+ * size, and we can differentiate heads from skb_small_head_cache
+ * vs system slabs by looking at their size (skb_end_offset()).
+ */
+#define SKB_SMALL_HEAD_CACHE_SIZE				\
+	(is_power_of_2(SKB_SMALL_HEAD_SIZE) ?			\
+		(SKB_SMALL_HEAD_SIZE + L1_CACHE_BYTES) :	\
+		SKB_SMALL_HEAD_SIZE)
+
+#define SKB_SMALL_HEAD_HEADROOM					\
+	SKB_WITH_OVERHEAD(SKB_SMALL_HEAD_CACHE_SIZE)
+#endif /* HAVE_SKB_SMALL_HEAD_CACHE */
+
 int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
 EXPORT_SYMBOL(sysctl_max_skb_frags);

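The "not a power of two" trick above deserves unpacking: in the size range
of MAX_TCP_HEADER-sized heads, generic kmalloc slabs come only in power-of-2
sizes (the 96/192-byte caches sit far below this range), so a non-power-of-2
cache size guarantees SKB_SMALL_HEAD_HEADROOM can never collide with
SKB_WITH_OVERHEAD() of any kmalloc-backed head. A standalone sketch, with an
assumed SKB_SMALL_HEAD_SIZE (the real value depends on MAX_TCP_HEADER and
the kernel config):

    #include <stdio.h>
    #include <stdbool.h>

    #define L1_CACHE_BYTES 64 /* assumed */

    /* Pretend SKB_HEAD_ALIGN(MAX_TCP_HEADER) happened to land on 1024. */
    #define SKB_SMALL_HEAD_SIZE 1024

    static bool is_power_of_2(unsigned long n)
    {
            return n != 0 && (n & (n - 1)) == 0;
    }

    int main(void)
    {
            unsigned long size = SKB_SMALL_HEAD_SIZE;

            /* Mirror of the SKB_SMALL_HEAD_CACHE_SIZE definition: bump a
             * power-of-2 size by one cache line so it cannot match any
             * large kmalloc slab size (512, 1024, 2048, ...).
             */
            if (is_power_of_2(size))
                    size += L1_CACHE_BYTES;

            printf("cache object size %lu, power of 2: %s\n",
                   size, is_power_of_2(size) ? "yes" : "no");
            return 0;
    }
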
@@ -478,25 +506,45 @@ EXPORT_SYMBOL(napi_build_skb);
  * may be used. Otherwise, the packet data may be discarded until enough
  * memory is free
  */
-static void *kmalloc_reserve(size_t size, gfp_t flags, int node,
+static void *kmalloc_reserve(unsigned int *size, gfp_t flags, int node,
 			     bool *pfmemalloc)
 {
-	void *obj;
 	bool ret_pfmemalloc = false;
+	unsigned int obj_size;
+	void *obj;
 
+	obj_size = SKB_HEAD_ALIGN(*size);
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+	if (obj_size <= SKB_SMALL_HEAD_CACHE_SIZE &&
+	    !(flags & KMALLOC_NOT_NORMAL_BITS)) {
+
+		/* skb_small_head_cache has non power of two size,
+		 * likely forcing SLUB to use order-3 pages.
+		 * We deliberately attempt a NOMEMALLOC allocation only.
+		 */
+		obj = kmem_cache_alloc_node(skb_small_head_cache,
+				flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
+				node);
+		if (obj) {
+			*size = SKB_SMALL_HEAD_CACHE_SIZE;
+			goto out;
+		}
+	}
+#endif
+	*size = obj_size = kmalloc_size_roundup(obj_size);
 	/*
 	 * Try a regular allocation, when that fails and we're not entitled
 	 * to the reserves, fail.
 	 */
-	obj = kmalloc_node_track_caller(size,
+	obj = kmalloc_node_track_caller(obj_size,
 					flags | __GFP_NOMEMALLOC | __GFP_NOWARN,
 					node);
 	if (obj || !(gfp_pfmemalloc_allowed(flags)))
 		goto out;
 
 	/* Try again but now we are using pfmemalloc reserves */
 	ret_pfmemalloc = true;
-	obj = kmalloc_node_track_caller(size, flags, node);
+	obj = kmalloc_node_track_caller(obj_size, flags, node);
 
 out:
 	if (pfmemalloc)
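
Note the signature change: size is now passed by reference and updated with
the bytes actually granted. All four callers updated below follow the same
shape; a condensed, hypothetical sketch of the new calling convention
(example_alloc_head and payload_len are placeholders, not kernel symbols):

    /* Hypothetical caller, mirroring the pattern of the real callers. */
    static void *example_alloc_head(unsigned int payload_len, gfp_t gfp_mask,
                                    unsigned int *room)
    {
            unsigned int size = payload_len; /* usable bytes wanted */
            bool pfmemalloc;
            void *data;

            /* kmalloc_reserve() now applies SKB_HEAD_ALIGN() itself and
             * writes the granted allocation size back through &size:
             * either SKB_SMALL_HEAD_CACHE_SIZE or a
             * kmalloc_size_roundup() result.
             */
            data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, &pfmemalloc);
            if (!data)
                    return NULL;

            /* Usable room before the skb_shared_info footer. */
            *room = SKB_WITH_OVERHEAD(size);
            return data;
    }
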
@@ -533,7 +581,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 {
 	struct kmem_cache *cache;
 	struct sk_buff *skb;
-	unsigned int osize;
 	bool pfmemalloc;
 	u8 *data;

@@ -558,26 +605,22 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
 	 * aligned memory blocks, unless SLUB/SLAB debug is enabled.
 	 * Both skb->head and skb_shared_info are cache line aligned.
 	 */
-	size = SKB_DATA_ALIGN(size);
-	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	osize = kmalloc_size_roundup(size);
-	data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc);
+	data = kmalloc_reserve(&size, gfp_mask, node, &pfmemalloc);
 	if (unlikely(!data))
 		goto nodata;
 	/* kmalloc_size_roundup() might give us more room than requested.
 	 * Put skb_shared_info exactly at the end of allocated zone,
 	 * to allow max possible filling before reallocation.
 	 */
-	size = SKB_WITH_OVERHEAD(osize);
-	prefetchw(data + size);
+	prefetchw(data + SKB_WITH_OVERHEAD(size));
 
 	/*
 	 * Only clear those fields we need to clear, not those that we will
 	 * actually initialise below. Hence, don't put any more fields after
 	 * the tail pointer in struct sk_buff!
 	 */
 	memset(skb, 0, offsetof(struct sk_buff, tail));
-	__build_skb_around(skb, data, osize);
+	__build_skb_around(skb, data, size);
 	skb->pfmemalloc = pfmemalloc;
 
 	if (flags & SKB_ALLOC_FCLONE) {
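
Because the granted size may exceed the request, __alloc_skb() derives the
shared-info position from the size written back by kmalloc_reserve(), not
from the size it asked for. A fragment-style sketch of the resulting layout
(data and size as in the hunk above):

    /* Inside __alloc_skb(), after kmalloc_reserve() succeeded: the
     * shared info always sits flush at the end of the granted block,
     * so any roundup slack becomes extra tailroom.
     *
     *  data                           data + SKB_WITH_OVERHEAD(size)
     *  |  headroom + packet data       |  struct skb_shared_info  |
     */
    struct skb_shared_info *shinfo =
            (struct skb_shared_info *)(data + SKB_WITH_OVERHEAD(size));
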
@@ -632,8 +675,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
 		goto skb_success;
 	}
 
-	len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	len = SKB_DATA_ALIGN(len);
+	len = SKB_HEAD_ALIGN(len);
 
 	if (sk_memalloc_socks())
 		gfp_mask |= __GFP_MEMALLOC;
@@ -732,8 +774,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
 		data = page_frag_alloc_1k(&nc->page_small, gfp_mask);
 		pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small);
 	} else {
-		len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-		len = SKB_DATA_ALIGN(len);
+		len = SKB_HEAD_ALIGN(len);
 
 		data = page_frag_alloc(&nc->page, len, gfp_mask);
 		pfmemalloc = nc->page.pfmemalloc;
@@ -809,6 +850,16 @@ static bool skb_pp_recycle(struct sk_buff *skb, void *data)
 	return page_pool_return_skb_page(virt_to_page(data));
 }
 
+static void skb_kfree_head(void *head, unsigned int end_offset)
+{
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+	if (end_offset == SKB_SMALL_HEAD_HEADROOM)
+		kmem_cache_free(skb_small_head_cache, head);
+	else
+#endif
+		kfree(head);
+}
+
 static void skb_free_head(struct sk_buff *skb)
 {
 	unsigned char *head = skb->head;
@@ -818,7 +869,7 @@ static void skb_free_head(struct sk_buff *skb)
 			return;
 		skb_free_frag(head);
 	} else {
-		kfree(head);
+		skb_kfree_head(head, skb_end_offset(skb));
 	}
 }
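
There is no flag in the skb saying which allocator owns the head; ownership
is recovered purely from the end offset. skb_kfree_head() takes that offset
as an argument rather than reading it from an skb because the series uses it
in two situations, sketched here (both shapes appear verbatim in the hunks
of this diff):

    /* 1) Head already owned by an skb: derive the offset. */
    skb_kfree_head(skb->head, skb_end_offset(skb));

    /* 2) Freshly allocated head, not yet attached to any skb (the
     *    error paths in pskb_expand_head() and the carve helpers
     *    below): pass SKB_WITH_OVERHEAD() of the granted size, which
     *    is what skb_end_offset() would have become.
     */
    skb_kfree_head(data, size); /* size = SKB_WITH_OVERHEAD(granted) */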

@@ -1938,10 +1989,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	if (skb_pfmemalloc(skb))
 		gfp_mask |= __GFP_MEMALLOC;
 
-	size = SKB_DATA_ALIGN(size);
-	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	size = kmalloc_size_roundup(size);
-	data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
 	if (!data)
 		goto nodata;
 	size = SKB_WITH_OVERHEAD(size);
@@ -2004,7 +2052,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
 	return 0;
 
 nofrags:
-	kfree(data);
+	skb_kfree_head(data, size);
 nodata:
 	return -ENOMEM;
 }
@@ -4641,6 +4689,13 @@ void __init skb_init(void)
 					      0,
 					      SLAB_HWCACHE_ALIGN|SLAB_PANIC,
 					      NULL);
+#ifdef HAVE_SKB_SMALL_HEAD_CACHE
+	skb_small_head_cache = kmem_cache_create("skbuff_small_head",
+						SKB_SMALL_HEAD_CACHE_SIZE,
+						0,
+						SLAB_HWCACHE_ALIGN | SLAB_PANIC,
+						NULL);
+#endif
 	skb_extensions_init();
 }

@@ -6289,10 +6344,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
 	if (skb_pfmemalloc(skb))
 		gfp_mask |= __GFP_MEMALLOC;
 
-	size = SKB_DATA_ALIGN(size);
-	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	size = kmalloc_size_roundup(size);
-	data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
 	if (!data)
 		return -ENOMEM;
 	size = SKB_WITH_OVERHEAD(size);
@@ -6308,7 +6360,7 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off,
 	if (skb_cloned(skb)) {
 		/* drop the old head gracefully */
 		if (skb_orphan_frags(skb, gfp_mask)) {
-			kfree(data);
+			skb_kfree_head(data, size);
 			return -ENOMEM;
 		}
 		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
@@ -6408,18 +6460,15 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 	if (skb_pfmemalloc(skb))
 		gfp_mask |= __GFP_MEMALLOC;
 
-	size = SKB_DATA_ALIGN(size);
-	size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
-	size = kmalloc_size_roundup(size);
-	data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL);
+	data = kmalloc_reserve(&size, gfp_mask, NUMA_NO_NODE, NULL);
 	if (!data)
 		return -ENOMEM;
 	size = SKB_WITH_OVERHEAD(size);
 
 	memcpy((struct skb_shared_info *)(data + size),
 	       skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0]));
 	if (skb_orphan_frags(skb, gfp_mask)) {
-		kfree(data);
+		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
 	shinfo = (struct skb_shared_info *)(data + size);
@@ -6455,7 +6504,7 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off,
 		/* skb_frag_unref() is not needed here as shinfo->nr_frags = 0. */
 		if (skb_has_frag_list(skb))
 			kfree_skb_list(skb_shinfo(skb)->frag_list);
-		kfree(data);
+		skb_kfree_head(data, size);
 		return -ENOMEM;
 	}
 	skb_release_data(skb, SKB_CONSUMED);
