Skip to content

Commit cc74ca3

Browse files
committed
Merge branch 'sched-cpumask-improve-on-cpumask_local_spread-locality'
Yury Norov says: ==================== sched: cpumask: improve on cpumask_local_spread() locality cpumask_local_spread() currently checks local node for presence of i'th CPU, and then if it finds nothing makes a flat search among all non-local CPUs. We can do it better by checking CPUs per NUMA hops. This has significant performance implications on NUMA machines, for example when using NUMA-aware allocated memory together with NUMA-aware IRQ affinity hints. Performance tests from patch 8 of this series for mellanox network driver show: TCP multi-stream, using 16 iperf3 instances pinned to 16 cores (with aRFS on). Active cores: 64,65,72,73,80,81,88,89,96,97,104,105,112,113,120,121 +-------------------------+-----------+------------------+------------------+ | | BW (Gbps) | TX side CPU util | RX side CPU util | +-------------------------+-----------+------------------+------------------+ | Baseline | 52.3 | 6.4 % | 17.9 % | +-------------------------+-----------+------------------+------------------+ | Applied on TX side only | 52.6 | 5.2 % | 18.5 % | +-------------------------+-----------+------------------+------------------+ | Applied on RX side only | 94.9 | 11.9 % | 27.2 % | +-------------------------+-----------+------------------+------------------+ | Applied on both sides | 95.1 | 8.4 % | 27.3 % | +-------------------------+-----------+------------------+------------------+ Bottleneck in RX side is released, reached linerate (~1.8x speedup). ~30% less cpu util on TX. ==================== Link: https://lore.kernel.org/r/20230121042436.2661843-1-yury.norov@gmail.com Signed-off-by: Jakub Kicinski <kuba@kernel.org>
2 parents 383d9f8 + 2ac4980 commit cc74ca3

File tree

7 files changed

+230
-25
lines changed

7 files changed

+230
-25
lines changed

drivers/net/ethernet/mellanox/mlx5/core/eq.c

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -817,9 +817,12 @@ static void comp_irqs_release(struct mlx5_core_dev *dev)
817817
static int comp_irqs_request(struct mlx5_core_dev *dev)
818818
{
819819
struct mlx5_eq_table *table = dev->priv.eq_table;
820+
const struct cpumask *prev = cpu_none_mask;
821+
const struct cpumask *mask;
820822
int ncomp_eqs = table->num_comp_eqs;
821823
u16 *cpus;
822824
int ret;
825+
int cpu;
823826
int i;
824827

825828
ncomp_eqs = table->num_comp_eqs;
@@ -838,8 +841,19 @@ static int comp_irqs_request(struct mlx5_core_dev *dev)
838841
ret = -ENOMEM;
839842
goto free_irqs;
840843
}
841-
for (i = 0; i < ncomp_eqs; i++)
842-
cpus[i] = cpumask_local_spread(i, dev->priv.numa_node);
844+
845+
i = 0;
846+
rcu_read_lock();
847+
for_each_numa_hop_mask(mask, dev->priv.numa_node) {
848+
for_each_cpu_andnot(cpu, mask, prev) {
849+
cpus[i] = cpu;
850+
if (++i == ncomp_eqs)
851+
goto spread_done;
852+
}
853+
prev = mask;
854+
}
855+
spread_done:
856+
rcu_read_unlock();
843857
ret = mlx5_irqs_request_vectors(dev, cpus, ncomp_eqs, table->comp_irqs);
844858
kfree(cpus);
845859
if (ret < 0)

include/linux/cpumask.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -391,6 +391,26 @@ unsigned int cpumask_nth_andnot(unsigned int cpu, const struct cpumask *srcp1,
391391
nr_cpumask_bits, cpumask_check(cpu));
392392
}
393393

394+
/**
 * cpumask_nth_and_andnot - get the Nth cpu set in 1st and 2nd cpumask, and clear in 3rd.
 * @cpu: the N'th cpu to find, starting from 0
 * @srcp1: the first input cpumask
 * @srcp2: the second input cpumask (ANDed with @srcp1)
 * @srcp3: the cpumask whose set bits are excluded from the search
 *
 * Returns the N'th cpu set in (@srcp1 & @srcp2 & ~@srcp3).
 * Returns >= nr_cpu_ids if such cpu doesn't exist.
 */
static __always_inline
unsigned int cpumask_nth_and_andnot(unsigned int cpu, const struct cpumask *srcp1,
							const struct cpumask *srcp2,
							const struct cpumask *srcp3)
{
	/* cpumask_check() validates @cpu while passing it through as N. */
	return find_nth_and_andnot_bit(cpumask_bits(srcp1),
					cpumask_bits(srcp2),
					cpumask_bits(srcp3),
					nr_cpumask_bits, cpumask_check(cpu));
}
413+
394414
#define CPU_BITS_NONE \
395415
{ \
396416
[0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \

include/linux/find.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,9 @@ unsigned long __find_nth_and_bit(const unsigned long *addr1, const unsigned long
2222
unsigned long size, unsigned long n);
2323
unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
2424
unsigned long size, unsigned long n);
25+
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1, const unsigned long *addr2,
26+
const unsigned long *addr3, unsigned long size,
27+
unsigned long n);
2528
extern unsigned long _find_first_and_bit(const unsigned long *addr1,
2629
const unsigned long *addr2, unsigned long size);
2730
extern unsigned long _find_first_zero_bit(const unsigned long *addr, unsigned long size);
@@ -255,6 +258,36 @@ unsigned long find_nth_andnot_bit(const unsigned long *addr1, const unsigned lon
255258
return __find_nth_andnot_bit(addr1, addr2, size, n);
256259
}
257260

261+
/**
 * find_nth_and_andnot_bit - find N'th set bit in 2 memory regions,
 *			     excluding those set in 3rd region
 * @addr1: The 1st address to start the search at
 * @addr2: The 2nd address to start the search at
 * @addr3: The 3rd address to start the search at
 * @size: The maximum number of bits to search
 * @n: The number of set bit, which position is needed, counting from 0
 *
 * Returns the bit number of the N'th set bit in (@addr1 & @addr2 & ~@addr3).
 * If no such bit exists, returns @size.
 */
static __always_inline
unsigned long find_nth_and_andnot_bit(const unsigned long *addr1,
					const unsigned long *addr2,
					const unsigned long *addr3,
					unsigned long size, unsigned long n)
{
	/* There can be at most @size set bits, so bit N >= size cannot exist. */
	if (n >= size)
		return size;

	/*
	 * Compile-time-constant single-word sizes: evaluate inline instead
	 * of calling the out-of-line multi-word implementation.
	 */
	if (small_const_nbits(size)) {
		unsigned long val = *addr1 & *addr2 & (~*addr3) & GENMASK(size - 1, 0);

		return val ? fns(val, n) : size;
	}

	return __find_nth_and_andnot_bit(addr1, addr2, addr3, size, n);
}
290+
258291
#ifndef find_first_and_bit
259292
/**
260293
* find_first_and_bit - find the first set bit in both memory regions

include/linux/topology.h

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,5 +245,38 @@ static inline const struct cpumask *cpu_cpu_mask(int cpu)
245245
return cpumask_of_node(cpu_to_node(cpu));
246246
}
247247

248+
#ifdef CONFIG_NUMA
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node);
extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops);
#else
/*
 * Without NUMA, all CPUs are equidistant: the N'th closest cpu is simply
 * the N'th cpu of @cpus.
 */
static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
	return cpumask_nth(cpu, cpus);
}

/* No NUMA topology to walk; the error makes for_each_numa_hop_mask() a no-op. */
static inline const struct cpumask *
sched_numa_hop_mask(unsigned int node, unsigned int hops)
{
	return ERR_PTR(-EOPNOTSUPP);
}
#endif	/* CONFIG_NUMA */

/**
 * for_each_numa_hop_mask - iterate over cpumasks of increasing NUMA distance
 *                          from a given node.
 * @mask: the iteration variable.
 * @node: the NUMA node to start the search from.
 *
 * Requires rcu_lock to be held.
 *
 * Yields cpu_online_mask for @node == NUMA_NO_NODE.
 *
 * Iteration stops when sched_numa_hop_mask() returns an error (no more hops,
 * or NUMA masks unavailable); the special NUMA_NO_NODE case is handled only
 * on the first pass (__hops == 0).
 */
#define for_each_numa_hop_mask(mask, node)				       \
	for (unsigned int __hops = 0;					       \
	     mask = (node != NUMA_NO_NODE || __hops) ?			       \
		     sched_numa_hop_mask(node, __hops) :		       \
		     cpu_online_mask,					       \
	     !IS_ERR_OR_NULL(mask);					       \
	     __hops++)
248281

249282
#endif /* _LINUX_TOPOLOGY_H */

kernel/sched/topology.c

Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
* Scheduler topology setup/handling methods
44
*/
55

6+
#include <linux/bsearch.h>
7+
68
DEFINE_MUTEX(sched_domains_mutex);
79

810
/* Protected by sched_domains_mutex: */
@@ -2067,6 +2069,94 @@ int sched_numa_find_closest(const struct cpumask *cpus, int cpu)
20672069
return found;
20682070
}
20692071

2072+
/*
 * Search key for the bsearch() over sched_domains_numa_masks performed by
 * sched_numa_find_nth_cpu().
 */
struct __cmp_key {
	const struct cpumask *cpus;	/* cpus to spread over */
	struct cpumask ***masks;	/* base of sched_domains_numa_masks */
	int node;			/* node whose distance ranks the hops */
	int cpu;			/* N'th cpu to find */
	int w;				/* out: weight of the preceding hop */
};

/*
 * bsearch() comparator: locate the first hop whose cumulative mask,
 * intersected with k->cpus, holds more than k->cpu CPUs.
 */
static int hop_cmp(const void *a, const void *b)
{
	struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b;
	struct __cmp_key *k = (struct __cmp_key *)a;

	if (cpumask_weight_and(k->cpus, cur_hop[k->node]) <= k->cpu)
		return 1;

	/*
	 * The first element has no predecessor: bail out before touching
	 * *((...)b - 1), which would be an out-of-bounds read one element
	 * before the masks array (caught by KASAN).
	 */
	if (b == k->masks) {
		k->w = 0;
		return 0;
	}

	prev_hop = *((struct cpumask ***)b - 1);
	k->w = cpumask_weight_and(k->cpus, prev_hop[k->node]);
	if (k->w <= k->cpu)
		return 0;

	return -1;
}
2095+
2096+
/**
 * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth cpu of
 *			       @cpus closest to @node.
 * @cpus: cpumask to find a cpu from
 * @cpu: Nth cpu to find
 * @node: the NUMA node to rank CPU distances from
 *
 * Returns: cpu, or nr_cpu_ids when nothing found.
 */
int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node)
{
	struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu };
	struct cpumask ***hop_masks;
	int hop, ret = nr_cpu_ids;

	rcu_read_lock();

	/* NULL until the NUMA topology has been built (see sched_init_numa()). */
	k.masks = rcu_dereference(sched_domains_numa_masks);
	if (!k.masks)
		goto unlock;

	/*
	 * Binary-search the cumulative per-hop masks for the first hop that
	 * contains more than @cpu CPUs of @cpus. hop_cmp() caches the weight
	 * of the hop preceding the match in k.w.
	 *
	 * NOTE(review): if even the farthest hop holds <= @cpu CPUs, bsearch()
	 * returns NULL and 'hop' below is garbage — callers appear to bound
	 * @cpu by num_online_cpus() first (see cpumask_local_spread());
	 * confirm all callers do.
	 */
	hop_masks = bsearch(&k, k.masks, sched_domains_numa_levels, sizeof(k.masks[0]), hop_cmp);
	hop = hop_masks - k.masks;

	/*
	 * Skip the k.w CPUs covered by closer hops, then take the remaining
	 * index within this hop's newly-added CPUs (hop 0 has no predecessor).
	 */
	ret = hop ?
		cpumask_nth_and_andnot(cpu - k.w, cpus, k.masks[hop][node], k.masks[hop-1][node]) :
		cpumask_nth_and(cpu, cpus, k.masks[0][node]);
unlock:
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu);
2127+
2128+
/**
2129+
* sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from
2130+
* @node
2131+
* @node: The node to count hops from.
2132+
* @hops: Include CPUs up to that many hops away. 0 means local node.
2133+
*
2134+
* Return: On success, a pointer to a cpumask of CPUs at most @hops away from
2135+
* @node, an error value otherwise.
2136+
*
2137+
* Requires rcu_lock to be held. Returned cpumask is only valid within that
2138+
* read-side section, copy it if required beyond that.
2139+
*
2140+
* Note that not all hops are equal in distance; see sched_init_numa() for how
2141+
* distances and masks are handled.
2142+
* Also note that this is a reflection of sched_domains_numa_masks, which may change
2143+
* during the lifetime of the system (offline nodes are taken out of the masks).
2144+
*/
2145+
const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops)
2146+
{
2147+
struct cpumask ***masks;
2148+
2149+
if (node >= nr_node_ids || hops >= sched_domains_numa_levels)
2150+
return ERR_PTR(-EINVAL);
2151+
2152+
masks = rcu_dereference(sched_domains_numa_masks);
2153+
if (!masks)
2154+
return ERR_PTR(-EBUSY);
2155+
2156+
return masks[hops][node];
2157+
}
2158+
EXPORT_SYMBOL_GPL(sched_numa_hop_mask);
2159+
20702160
#endif /* CONFIG_NUMA */
20712161

20722162
static int __sdt_alloc(const struct cpumask *cpu_map)

lib/cpumask.c

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -110,15 +110,33 @@ void __init free_bootmem_cpumask_var(cpumask_var_t mask)
110110
#endif
111111

112112
/**
113-
* cpumask_local_spread - select the i'th cpu with local numa cpu's first
113+
* cpumask_local_spread - select the i'th cpu based on NUMA distances
114114
* @i: index number
115115
* @node: local numa_node
116116
*
117-
* This function selects an online CPU according to a numa aware policy;
118-
* local cpus are returned first, followed by non-local ones, then it
119-
* wraps around.
117+
* Returns online CPU according to a numa aware policy; local cpus are returned
118+
* first, followed by non-local ones, then it wraps around.
120119
*
121-
* It's not very efficient, but useful for setup.
120+
* For those who wants to enumerate all CPUs based on their NUMA distances,
121+
* i.e. call this function in a loop, like:
122+
*
123+
* for (i = 0; i < num_online_cpus(); i++) {
124+
* cpu = cpumask_local_spread(i, node);
125+
* do_something(cpu);
126+
* }
127+
*
128+
* There's a better alternative based on for_each()-like iterators:
129+
*
130+
* for_each_numa_hop_mask(mask, node) {
131+
* for_each_cpu_andnot(cpu, mask, prev)
132+
* do_something(cpu);
133+
* prev = mask;
134+
* }
135+
*
136+
* It's simpler, though somewhat more verbose, than the above. Complexity of iterator-based
137+
* enumeration is O(sched_domains_numa_levels * nr_cpu_ids), while
138+
* cpumask_local_spread() when called for each cpu is
139+
* O(sched_domains_numa_levels * nr_cpu_ids * log(nr_cpu_ids)).
122140
*/
123141
unsigned int cpumask_local_spread(unsigned int i, int node)
124142
{
@@ -127,24 +145,12 @@ unsigned int cpumask_local_spread(unsigned int i, int node)
127145
/* Wrap: we always want a cpu. */
128146
i %= num_online_cpus();
129147

130-
if (node == NUMA_NO_NODE) {
131-
cpu = cpumask_nth(i, cpu_online_mask);
132-
if (cpu < nr_cpu_ids)
133-
return cpu;
134-
} else {
135-
/* NUMA first. */
136-
cpu = cpumask_nth_and(i, cpu_online_mask, cpumask_of_node(node));
137-
if (cpu < nr_cpu_ids)
138-
return cpu;
139-
140-
i -= cpumask_weight_and(cpu_online_mask, cpumask_of_node(node));
141-
142-
/* Skip NUMA nodes, done above. */
143-
cpu = cpumask_nth_andnot(i, cpu_online_mask, cpumask_of_node(node));
144-
if (cpu < nr_cpu_ids)
145-
return cpu;
146-
}
147-
BUG();
148+
cpu = (node == NUMA_NO_NODE) ?
149+
cpumask_nth(i, cpu_online_mask) :
150+
sched_numa_find_nth_cpu(cpu_online_mask, i, node);
151+
152+
WARN_ON(cpu >= nr_cpu_ids);
153+
return cpu;
148154
}
149155
EXPORT_SYMBOL(cpumask_local_spread);
150156

lib/find_bit.c

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,6 +155,15 @@ unsigned long __find_nth_andnot_bit(const unsigned long *addr1, const unsigned l
155155
}
156156
EXPORT_SYMBOL(__find_nth_andnot_bit);
157157

158+
/*
 * Out-of-line N'th-bit search over the multi-word expression
 * addr1[] & addr2[] & ~addr3[]. The word index `idx` is not a free
 * variable: FIND_NTH_BIT() declares it and expands the expression
 * once per word of the bitmaps.
 */
unsigned long __find_nth_and_andnot_bit(const unsigned long *addr1,
					const unsigned long *addr2,
					const unsigned long *addr3,
					unsigned long size, unsigned long n)
{
	return FIND_NTH_BIT(addr1[idx] & addr2[idx] & ~addr3[idx], size, n);
}
EXPORT_SYMBOL(__find_nth_and_andnot_bit);
166+
158167
#ifndef find_next_and_bit
159168
unsigned long _find_next_and_bit(const unsigned long *addr1, const unsigned long *addr2,
160169
unsigned long nbits, unsigned long start)

0 commit comments

Comments
 (0)