Commit 1b7e900

ryncsn authored and akpm00 committed
mm, swap: use percpu cluster as allocation fast path
The current allocation workflow first traverses the plist with a global lock held; after choosing a device, it uses the percpu cluster on that swap device. This commit moves the percpu cluster variable out of being tied to individual swap devices, making it a global percpu variable that is used directly for allocation as a fast path.

The global percpu cluster variable will never point to an HDD device, and allocations on an HDD device remain globally serialized.

This improves allocator performance and prepares for removal of the slot cache in later commits. There shouldn't be much observable behavior change, except one thing: this changes how swap device allocation rotation works.

Currently, each allocation rotates the plist, and because of the slot cache (one order 0 allocation usually returns 64 entries), swap devices of the same priority are rotated for every 64 order 0 entries consumed. High order allocations are different: they bypass the slot cache, so the swap device is rotated for every 16K, 32K, or up to 2M allocation.

The rotation rule was never clearly defined or documented, and it was changed several times without mention. After this commit, and once the slot cache is gone in later commits, swap device rotation will happen for every consumed cluster. Ideally, non-HDD devices will be rotated once 2M of space has been consumed for each order. Fragmented clusters will rotate the device faster, which seems OK. HDD devices are rotated for every allocation regardless of the allocation order, which should also be OK and is trivial.

This commit also slightly changes allocation behaviour for the slot cache. The newly added cluster allocation fast path may allocate entries from a different device into the slot cache. This is not observable from user space, impacts performance only very slightly, and the slot cache is removed entirely in the next commit, so it can be ignored.

Link: https://lkml.kernel.org/r/20250313165935.63303-6-ryncsn@gmail.com
Signed-off-by: Kairui Song <kasong@tencent.com>
Cc: Baolin Wang <baolin.wang@linux.alibaba.com>
Cc: Baoquan He <bhe@redhat.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@linux.alibaba.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Matthew Wilcox (Oracle) <willy@infradead.org>
Cc: Nhat Pham <nphamcs@gmail.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
1 parent 280cfcc commit 1b7e900

2 files changed: 121 additions, 48 deletions
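The commit message above boils down to a small model: a global, per-CPU, per-order cache of {device, next offset} is tried first, and only on a miss does the allocator fall back to the locked priority-list scan. The sketch below is plain userspace C written for this page, not kernel code; every name in it (fast_path_cache, pick_device_and_refill, alloc_swap_slot, the two-device array) is invented for illustration, and locking, cluster exhaustion, and the HDD case are omitted.

/* Illustrative userspace model only -- not the kernel implementation. */
#include <stdio.h>

#define NR_CPUS		4
#define SWAP_NR_ORDERS	10	/* assumption made for the sketch */

struct swap_device { int id; };	/* stand-in for swap_info_struct */

/* One global per-CPU cache: a {device, next offset} pair per order,
 * mirroring the shape of percpu_swap_cluster in the patch. */
struct percpu_cache {
	struct swap_device *si[SWAP_NR_ORDERS];
	unsigned long offset[SWAP_NR_ORDERS];
};
static struct percpu_cache fast_path_cache[NR_CPUS];

/* Slow path stub: rotate the device list, pick a device, and point the
 * CPU's cache at a fresh cluster.  The real code does this under
 * swap_avail_lock and per-cluster locks. */
static struct swap_device devices[2] = { { 0 }, { 1 } };
static unsigned long pick_device_and_refill(int cpu, int order)
{
	static int rotor;
	struct swap_device *si = &devices[rotor++ % 2];	/* plist rotation */
	unsigned long start = 512;			/* a fresh cluster */

	fast_path_cache[cpu].si[order] = si;
	fast_path_cache[cpu].offset[order] = start + (1UL << order);
	return start;
}

/* Allocation consults the per-CPU cache first and falls back on a miss. */
static unsigned long alloc_swap_slot(int cpu, int order)
{
	struct percpu_cache *pcp = &fast_path_cache[cpu];
	unsigned long offset = pcp->offset[order];

	if (pcp->si[order] && offset) {
		pcp->offset[order] += 1UL << order;	/* stay sequential */
		return offset;
	}
	return pick_device_and_refill(cpu, order);
}

int main(void)
{
	for (int i = 0; i < 3; i++)
		printf("cpu0, order 0 -> offset %lu\n", alloc_swap_slot(0, 0));
	return 0;
}

In the real patch the cache is a DEFINE_PER_CPU variable protected by a local_lock, the fast path revalidates the cached device via get_swap_device_info(), and alloc_swap_scan_cluster() is what refreshes the cached offset, as the hunks below show.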

include/linux/swap.h: 4 additions, 7 deletions

@@ -284,12 +284,10 @@ enum swap_cluster_flags {
 #endif
 
 /*
- * We assign a cluster to each CPU, so each CPU can allocate swap entry from
- * its own cluster and swapout sequentially. The purpose is to optimize swapout
- * throughput.
+ * We keep using same cluster for rotational device so IO will be sequential.
+ * The purpose is to optimize SWAP throughput on these device.
  */
-struct percpu_cluster {
-	local_lock_t lock; /* Protect the percpu_cluster above */
+struct swap_sequential_cluster {
 	unsigned int next[SWAP_NR_ORDERS]; /* Likely next allocation offset */
 };
 

@@ -315,8 +313,7 @@ struct swap_info_struct {
 	atomic_long_t frag_cluster_nr[SWAP_NR_ORDERS];
 	unsigned int pages;		/* total of usable pages of swap */
 	atomic_long_t inuse_pages;	/* number of those currently in use */
-	struct percpu_cluster __percpu *percpu_cluster; /* per cpu's swap location */
-	struct percpu_cluster *global_cluster; /* Use one global cluster for rotating device */
+	struct swap_sequential_cluster *global_cluster; /* Use one global cluster for rotating device */
	spinlock_t global_cluster_lock;	/* Serialize usage of global cluster */
 	struct rb_root swap_extent_root;/* root of the swap extent rbtree */
 	struct block_device *bdev;	/* swap device or bdev of swap file */
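For orientation, the header change above pairs with the first mm/swapfile.c hunk below: the per-device, per-CPU hint is deleted, only a plain next-offset hint for rotational devices survives in the header, and the per-CPU state reappears as one global variable in mm/swapfile.c. Side by side, excerpted from this patch as a reading aid rather than a compilable unit:

/* Removed: one per-CPU cluster hint embedded in each swap device. */
struct percpu_cluster {
	local_lock_t lock;			/* Protect the percpu_cluster above */
	unsigned int next[SWAP_NR_ORDERS];	/* Likely next allocation offset */
};

/* Kept (renamed): a plain hint, now used only for rotational devices
 * via si->global_cluster under si->global_cluster_lock. */
struct swap_sequential_cluster {
	unsigned int next[SWAP_NR_ORDERS];	/* Likely next allocation offset */
};

/* Added in mm/swapfile.c: one global per-CPU cache shared by all
 * SSD-backed devices, remembering both the device and the offset. */
struct percpu_swap_cluster {
	struct swap_info_struct *si[SWAP_NR_ORDERS];
	unsigned long offset[SWAP_NR_ORDERS];
	local_lock_t lock;
};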

mm/swapfile.c: 117 additions, 41 deletions

@@ -116,6 +116,18 @@ static atomic_t proc_poll_event = ATOMIC_INIT(0);
 
 atomic_t nr_rotate_swap = ATOMIC_INIT(0);
 
+struct percpu_swap_cluster {
+	struct swap_info_struct *si[SWAP_NR_ORDERS];
+	unsigned long offset[SWAP_NR_ORDERS];
+	local_lock_t lock;
+};
+
+static DEFINE_PER_CPU(struct percpu_swap_cluster, percpu_swap_cluster) = {
+	.si = { NULL },
+	.offset = { SWAP_ENTRY_INVALID },
+	.lock = INIT_LOCAL_LOCK(),
+};
+
 static struct swap_info_struct *swap_type_to_swap_info(int type)
 {
 	if (type >= MAX_SWAPFILES)
@@ -539,7 +551,7 @@ static bool swap_do_scheduled_discard(struct swap_info_struct *si)
 		ci = list_first_entry(&si->discard_clusters, struct swap_cluster_info, list);
 		/*
 		 * Delete the cluster from list to prepare for discard, but keep
-		 * the CLUSTER_FLAG_DISCARD flag, there could be percpu_cluster
+		 * the CLUSTER_FLAG_DISCARD flag, percpu_swap_cluster could be
 		 * pointing to it, or ran into by relocate_cluster.
 		 */
 		list_del(&ci->list);
@@ -805,10 +817,12 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si,
 out:
 	relocate_cluster(si, ci);
 	unlock_cluster(ci);
-	if (si->flags & SWP_SOLIDSTATE)
-		__this_cpu_write(si->percpu_cluster->next[order], next);
-	else
+	if (si->flags & SWP_SOLIDSTATE) {
+		this_cpu_write(percpu_swap_cluster.offset[order], next);
+		this_cpu_write(percpu_swap_cluster.si[order], si);
+	} else {
 		si->global_cluster->next[order] = next;
+	}
 	return found;
 }

@@ -862,20 +876,18 @@ static void swap_reclaim_work(struct work_struct *work)
 }
 
 /*
- * Try to get swap entries with specified order from current cpu's swap entry
- * pool (a cluster). This might involve allocating a new cluster for current CPU
- * too.
+ * Try to allocate swap entries with specified order and try set a new
+ * cluster for current CPU too.
  */
 static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int order,
 					      unsigned char usage)
 {
 	struct swap_cluster_info *ci;
-	unsigned int offset, found = 0;
+	unsigned int offset = SWAP_ENTRY_INVALID, found = SWAP_ENTRY_INVALID;
 
 	if (si->flags & SWP_SOLIDSTATE) {
-		/* Fast path using per CPU cluster */
-		local_lock(&si->percpu_cluster->lock);
-		offset = __this_cpu_read(si->percpu_cluster->next[order]);
+		if (si == this_cpu_read(percpu_swap_cluster.si[order]))
+			offset = this_cpu_read(percpu_swap_cluster.offset[order]);
 	} else {
 		/* Serialize HDD SWAP allocation for each device. */
 		spin_lock(&si->global_cluster_lock);
@@ -973,9 +985,7 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 		}
 	}
 done:
-	if (si->flags & SWP_SOLIDSTATE)
-		local_unlock(&si->percpu_cluster->lock);
-	else
+	if (!(si->flags & SWP_SOLIDSTATE))
 		spin_unlock(&si->global_cluster_lock);
 	return found;
 }
@@ -1196,6 +1206,51 @@ static bool get_swap_device_info(struct swap_info_struct *si)
 	return true;
 }
 
+/*
+ * Fast path try to get swap entries with specified order from current
+ * CPU's swap entry pool (a cluster).
+ */
+static int swap_alloc_fast(swp_entry_t entries[],
+			   unsigned char usage,
+			   int order, int n_goal)
+{
+	struct swap_cluster_info *ci;
+	struct swap_info_struct *si;
+	unsigned int offset, found;
+	int n_ret = 0;
+
+	n_goal = min(n_goal, SWAP_BATCH);
+
+	/*
+	 * Once allocated, swap_info_struct will never be completely freed,
+	 * so checking it's liveness by get_swap_device_info is enough.
+	 */
+	si = this_cpu_read(percpu_swap_cluster.si[order]);
+	offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+	if (!si || !offset || !get_swap_device_info(si))
+		return 0;
+
+	while (offset) {
+		ci = lock_cluster(si, offset);
+		if (!cluster_is_usable(ci, order)) {
+			unlock_cluster(ci);
+			break;
+		}
+		if (cluster_is_empty(ci))
+			offset = cluster_offset(si, ci);
+		found = alloc_swap_scan_cluster(si, ci, offset, order, usage);
+		if (!found)
+			break;
+		entries[n_ret++] = swp_entry(si->type, found);
+		if (n_ret == n_goal)
+			break;
+		offset = this_cpu_read(percpu_swap_cluster.offset[order]);
+	}
+
+	put_swap_device(si);
+	return n_ret;
+}
+
 int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 {
 	int order = swap_entry_order(entry_order);
@@ -1204,19 +1259,36 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 	int n_ret = 0;
 	int node;
 
+	/* Fast path using percpu cluster */
+	local_lock(&percpu_swap_cluster.lock);
+	n_ret = swap_alloc_fast(swp_entries,
+				SWAP_HAS_CACHE,
+				order, n_goal);
+	if (n_ret == n_goal)
+		goto out;
+
+	n_goal = min_t(int, n_goal - n_ret, SWAP_BATCH);
+	/* Rotate the device and switch to a new cluster */
 	spin_lock(&swap_avail_lock);
 start_over:
 	node = numa_node_id();
 	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
-		/* requeue si to after same-priority siblings */
 		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
 		spin_unlock(&swap_avail_lock);
 		if (get_swap_device_info(si)) {
-			n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
-					n_goal, swp_entries, order);
+			/*
+			 * For order 0 allocation, try best to fill the request
+			 * as it's used by slot cache.
+			 *
+			 * For mTHP allocation, it always have n_goal == 1,
+			 * and falling a mTHP swapin will just make the caller
+			 * fallback to order 0 allocation, so just bail out.
+			 */
+			n_ret += scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal,
+					swp_entries + n_ret, order);
 			put_swap_device(si);
 			if (n_ret || size > 1)
-				goto check_out;
+				goto out;
 		}
 
 		spin_lock(&swap_avail_lock);
@@ -1234,12 +1306,10 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_order)
 		if (plist_node_empty(&next->avail_lists[node]))
 			goto start_over;
 	}
-
 	spin_unlock(&swap_avail_lock);
-
-check_out:
+out:
+	local_unlock(&percpu_swap_cluster.lock);
 	atomic_long_sub(n_ret * size, &nr_swap_pages);
-
 	return n_ret;
 }
@@ -2597,6 +2667,28 @@ static void wait_for_allocation(struct swap_info_struct *si)
 	}
 }
 
+/*
+ * Called after swap device's reference count is dead, so
+ * neither scan nor allocation will use it.
+ */
+static void flush_percpu_swap_cluster(struct swap_info_struct *si)
+{
+	int cpu, i;
+	struct swap_info_struct **pcp_si;
+
+	for_each_possible_cpu(cpu) {
+		pcp_si = per_cpu_ptr(percpu_swap_cluster.si, cpu);
+		/*
+		 * Invalidate the percpu swap cluster cache, si->users
+		 * is dead, so no new user will point to it, just flush
+		 * any existing user.
+		 */
+		for (i = 0; i < SWAP_NR_ORDERS; i++)
+			cmpxchg(&pcp_si[i], si, NULL);
+	}
+}
+
+
 SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 {
 	struct swap_info_struct *p = NULL;
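One detail of the hunk above worth calling out: flush_percpu_swap_cluster() uses cmpxchg() rather than a plain store, so a per-CPU slot is cleared only if it still points at the device being removed; a slot that points at a different device (for example because an allocation has since repointed it) is left untouched. A minimal userspace analogue of that compare-and-exchange pattern, using the GCC/Clang __atomic builtins, with names invented for the example:

#include <stdio.h>

struct swap_device { int id; };

static struct swap_device dev_a = { 0 }, dev_b = { 1 };
/* Stands in for one percpu_swap_cluster.si[order] slot. */
static struct swap_device *cached_si = &dev_a;

/* Clear the cached pointer only if it still refers to the dying device. */
static void invalidate_if_cached(struct swap_device *dying)
{
	struct swap_device *expected = dying;

	__atomic_compare_exchange_n(&cached_si, &expected, NULL,
				    0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST);
}

int main(void)
{
	invalidate_if_cached(&dev_b);	/* no match: dev_a stays cached */
	printf("after flushing dev_b: cached_si = %p\n", (void *)cached_si);
	invalidate_if_cached(&dev_a);	/* match: the slot is cleared */
	printf("after flushing dev_a: cached_si = %p\n", (void *)cached_si);
	return 0;
}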
@@ -2698,6 +2790,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 
 	flush_work(&p->discard_work);
 	flush_work(&p->reclaim_work);
+	flush_percpu_swap_cluster(p);
 
 	destroy_swap_extents(p);
 	if (p->flags & SWP_CONTINUED)
@@ -2725,8 +2818,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	arch_swap_invalidate_area(p->type);
 	zswap_swapoff(p->type);
 	mutex_unlock(&swapon_mutex);
-	free_percpu(p->percpu_cluster);
-	p->percpu_cluster = NULL;
 	kfree(p->global_cluster);
 	p->global_cluster = NULL;
 	vfree(swap_map);
@@ -3125,7 +3216,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 	unsigned long nr_clusters = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
 	struct swap_cluster_info *cluster_info;
 	unsigned long i, j, idx;
-	int cpu, err = -ENOMEM;
+	int err = -ENOMEM;
 
 	cluster_info = kvcalloc(nr_clusters, sizeof(*cluster_info), GFP_KERNEL);
 	if (!cluster_info)
@@ -3134,20 +3225,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 	for (i = 0; i < nr_clusters; i++)
 		spin_lock_init(&cluster_info[i].lock);
 
-	if (si->flags & SWP_SOLIDSTATE) {
-		si->percpu_cluster = alloc_percpu(struct percpu_cluster);
-		if (!si->percpu_cluster)
-			goto err_free;
-
-		for_each_possible_cpu(cpu) {
-			struct percpu_cluster *cluster;
-
-			cluster = per_cpu_ptr(si->percpu_cluster, cpu);
-			for (i = 0; i < SWAP_NR_ORDERS; i++)
-				cluster->next[i] = SWAP_ENTRY_INVALID;
-			local_lock_init(&cluster->lock);
-		}
-	} else {
+	if (!(si->flags & SWP_SOLIDSTATE)) {
 		si->global_cluster = kmalloc(sizeof(*si->global_cluster),
 								GFP_KERNEL);
 		if (!si->global_cluster)
@@ -3424,8 +3502,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 bad_swap_unlock_inode:
 	inode_unlock(inode);
 bad_swap:
-	free_percpu(si->percpu_cluster);
-	si->percpu_cluster = NULL;
 	kfree(si->global_cluster);
 	si->global_cluster = NULL;
 	inode = NULL;
