
Commit 661383c

ryncsn authored and akpm00 committed
mm: swap: reclaim the cached parts that got scanned

This commit implements reclaim during scan for the cluster allocator.

Cluster scanning was unable to reuse SWAP_HAS_CACHE slots, which could
result in a low allocation success rate or early OOM. So, to ensure a
maximum allocation success rate, integrate reclaiming with scanning: if
a range of suitable swap slots is found but is fragmented by HAS_CACHE
slots, just try to reclaim those slots.

Link: https://lkml.kernel.org/r/20240730-swap-allocator-v5-8-cb9c148b9297@kernel.org
Signed-off-by: Kairui Song <kasong@tencent.com>
Reported-by: Barry Song <21cnbao@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Kalesh Singh <kaleshsingh@google.com>
Cc: Ryan Roberts <ryan.roberts@arm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
1 parent 477cb7b commit 661383c
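For orientation, here is a minimal user-space sketch of the scan-then-reclaim idea described above. It is not kernel code: the byte encoding only loosely mirrors swap_map (0 = free, HAS_CACHE = clean swap cache only, anything else = in use), reclaim_slot() is a hypothetical stand-in for __try_to_reclaim_swap(), and all locking is omitted.

/*
 * Sketch only: models the map as a plain array and "reclaims" by
 * clearing entries. The kernel version must drop and retake locks
 * around reclaim, which is why it rechecks the whole range after.
 */
#include <stdbool.h>
#include <stdio.h>

#define HAS_CACHE 0x40	/* stand-in for SWAP_HAS_CACHE */

static unsigned char map[8] = { 0, HAS_CACHE, 0, 0, HAS_CACHE, 1, 0, 0 };

/* Hypothetical stand-in for __try_to_reclaim_swap(); always succeeds here. */
static bool reclaim_slot(unsigned long off)
{
	map[off] = 0;
	return true;
}

/* A range is usable only if every slot is free or was reclaimed. */
static bool scan_range(unsigned long start, unsigned long nr)
{
	unsigned long off, end = start + nr;
	bool need_reclaim = false;

	for (off = start; off < end; off++) {
		if (map[off] == 0)
			continue;
		if (map[off] == HAS_CACHE) {	/* cached only: maybe reclaimable */
			need_reclaim = true;
			continue;
		}
		return false;			/* slot really in use, give up early */
	}

	if (!need_reclaim)
		return true;

	for (off = start; off < end; off++)
		if (map[off] == HAS_CACHE && !reclaim_slot(off))
			return false;

	/* Recheck: in the kernel, slots may change while unlocked. */
	for (off = start; off < end; off++)
		if (map[off])
			return false;
	return true;
}

int main(void)
{
	printf("[0,4) usable: %d\n", scan_range(0, 4));	/* 1: slot 1 reclaimed */
	printf("[4,8) usable: %d\n", scan_range(4, 4));	/* 0: slot 5 is in use */
	return 0;
}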

2 files changed (+110 −31 lines)

include/linux/swap.h

Lines changed: 1 addition & 0 deletions
@@ -301,6 +301,7 @@ struct swap_info_struct {
 					/* list of cluster that contains at least one free slot */
 	struct list_head frag_clusters[SWAP_NR_ORDERS];
 					/* list of cluster that are fragmented or contented */
+	unsigned int frag_cluster_nr[SWAP_NR_ORDERS];
 	unsigned int lowest_bit;	/* index of first free in swap_map */
 	unsigned int highest_bit;	/* index of last free in swap_map */
 	unsigned int pages;		/* total of usable pages of swap */

mm/swapfile.c

Lines changed: 109 additions & 31 deletions
@@ -513,6 +513,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
 	VM_BUG_ON(ci->count != 0);
 	lockdep_assert_held(&si->lock);
 	lockdep_assert_held(&ci->lock);
+
+	if (ci->flags & CLUSTER_FLAG_FRAG)
+		si->frag_cluster_nr[ci->order]--;
+
 	/*
 	 * If the swap is discardable, prepare discard the cluster
 	 * instead of free it immediately. The cluster will be freed
@@ -572,31 +576,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
 
 	if (!(ci->flags & CLUSTER_FLAG_NONFULL)) {
 		VM_BUG_ON(ci->flags & CLUSTER_FLAG_FREE);
-		if (ci->flags & CLUSTER_FLAG_FRAG)
+		if (ci->flags & CLUSTER_FLAG_FRAG) {
+			p->frag_cluster_nr[ci->order]--;
 			list_move_tail(&ci->list, &p->nonfull_clusters[ci->order]);
-		else
+		} else {
 			list_add_tail(&ci->list, &p->nonfull_clusters[ci->order]);
+		}
 		ci->flags = CLUSTER_FLAG_NONFULL;
 	}
 }
 
-static inline bool cluster_scan_range(struct swap_info_struct *si, unsigned int start,
-				      unsigned int nr_pages)
+static bool cluster_reclaim_range(struct swap_info_struct *si,
+				  struct swap_cluster_info *ci,
+				  unsigned long start, unsigned long end)
 {
-	unsigned char *p = si->swap_map + start;
-	unsigned char *end = p + nr_pages;
+	unsigned char *map = si->swap_map;
+	unsigned long offset;
+
+	spin_unlock(&ci->lock);
+	spin_unlock(&si->lock);
+
+	for (offset = start; offset < end; offset++) {
+		switch (READ_ONCE(map[offset])) {
+		case 0:
+			continue;
+		case SWAP_HAS_CACHE:
+			if (__try_to_reclaim_swap(si, offset, TTRS_ANYWAY | TTRS_DIRECT) > 0)
+				continue;
+			goto out;
+		default:
+			goto out;
+		}
+	}
+out:
+	spin_lock(&si->lock);
+	spin_lock(&ci->lock);
 
-	while (p < end)
-		if (*p++)
+	/*
+	 * Recheck the range no matter reclaim succeeded or not, the slot
+	 * could have been freed while we are not holding the lock.
+	 */
+	for (offset = start; offset < end; offset++)
+		if (READ_ONCE(map[offset]))
 			return false;
 
 	return true;
 }
 
+static bool cluster_scan_range(struct swap_info_struct *si,
+			       struct swap_cluster_info *ci,
+			       unsigned long start, unsigned int nr_pages)
+{
+	unsigned long offset, end = start + nr_pages;
+	unsigned char *map = si->swap_map;
+	bool need_reclaim = false;
 
-static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
-				       unsigned int start, unsigned char usage,
-				       unsigned int order)
+	for (offset = start; offset < end; offset++) {
+		switch (READ_ONCE(map[offset])) {
+		case 0:
+			continue;
+		case SWAP_HAS_CACHE:
+			if (!vm_swap_full())
+				return false;
+			need_reclaim = true;
+			continue;
+		default:
+			return false;
+		}
+	}
+
+	if (need_reclaim)
+		return cluster_reclaim_range(si, ci, start, end);
+
+	return true;
+}
+
+static void cluster_alloc_range(struct swap_info_struct *si, struct swap_cluster_info *ci,
+				unsigned int start, unsigned char usage,
+				unsigned int order)
 {
 	unsigned int nr_pages = 1 << order;
 
@@ -615,6 +672,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_
 	if (ci->count == SWAPFILE_CLUSTER) {
 		VM_BUG_ON(!(ci->flags &
 			  (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG)));
+		if (ci->flags & CLUSTER_FLAG_FRAG)
+			si->frag_cluster_nr[ci->order]--;
 		list_del(&ci->list);
 		ci->flags = 0;
 	}
@@ -640,7 +699,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
 	}
 
 	while (offset <= end) {
-		if (cluster_scan_range(si, offset, nr_pages)) {
+		if (cluster_scan_range(si, ci, offset, nr_pages)) {
 			cluster_alloc_range(si, ci, offset, usage, order);
 			*foundp = offset;
 			if (ci->count == SWAPFILE_CLUSTER) {
@@ -668,9 +727,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 					      unsigned char usage)
 {
 	struct percpu_cluster *cluster;
-	struct swap_cluster_info *ci, *n;
+	struct swap_cluster_info *ci;
 	unsigned int offset, found = 0;
-	LIST_HEAD(fraged);
 
 new_cluster:
 	lockdep_assert_held(&si->lock);
@@ -690,25 +748,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 	}
 
 	if (order < PMD_ORDER) {
-		list_for_each_entry_safe(ci, n, &si->nonfull_clusters[order], list) {
-			list_move_tail(&ci->list, &fraged);
+		unsigned int frags = 0;
+
+		while (!list_empty(&si->nonfull_clusters[order])) {
+			ci = list_first_entry(&si->nonfull_clusters[order],
+					      struct swap_cluster_info, list);
+			list_move_tail(&ci->list, &si->frag_clusters[order]);
 			ci->flags = CLUSTER_FLAG_FRAG;
+			si->frag_cluster_nr[order]++;
 			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
 							 &found, order, usage);
+			frags++;
 			if (found)
 				break;
 		}
 
 		if (!found) {
-			list_for_each_entry_safe(ci, n, &si->frag_clusters[order], list) {
+			/*
+			 * Nonfull clusters are moved to frag tail if we reached
+			 * here, count them too, don't over scan the frag list.
+			 */
+			while (frags < si->frag_cluster_nr[order]) {
+				ci = list_first_entry(&si->frag_clusters[order],
+						      struct swap_cluster_info, list);
+				/*
+				 * Rotate the frag list to iterate, they were all failing
+				 * high order allocation or moved here due to per-CPU usage,
+				 * this helps keep usable clusters ahead.
+				 */
+				list_move_tail(&ci->list, &si->frag_clusters[order]);
 				offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
 								 &found, order, usage);
+				frags++;
 				if (found)
 					break;
 			}
 		}
-
-		list_splice_tail(&fraged, &si->frag_clusters[order]);
 	}
 
 	if (found)
@@ -729,25 +804,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
 
 	/* Order 0 stealing from higher order */
 	for (int o = 1; o < SWAP_NR_ORDERS; o++) {
-		if (!list_empty(&si->frag_clusters[o])) {
+		/*
+		 * Clusters here have at least one usable slot and can't fail order 0
+		 * allocation, but reclaim may drop si->lock and race with another user.
+		 */
+		while (!list_empty(&si->frag_clusters[o])) {
 			ci = list_first_entry(&si->frag_clusters[o],
 					      struct swap_cluster_info, list);
-			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci), &found,
-							 0, usage);
-			VM_BUG_ON(!found);
-			goto done;
+			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
+							 &found, 0, usage);
+			if (found)
+				goto done;
 		}
 
-		if (!list_empty(&si->nonfull_clusters[o])) {
-			ci = list_first_entry(&si->nonfull_clusters[o], struct swap_cluster_info,
-					      list);
+		while (!list_empty(&si->nonfull_clusters[o])) {
+			ci = list_first_entry(&si->nonfull_clusters[o],
					      struct swap_cluster_info, list);
 			offset = alloc_swap_scan_cluster(si, cluster_offset(si, ci),
 							 &found, 0, usage);
-			VM_BUG_ON(!found);
-			goto done;
+			if (found)
+				goto done;
 		}
 	}
-
 done:
 	cluster->next[order] = offset;
 	return found;
@@ -3042,6 +3120,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	for (i = 0; i < SWAP_NR_ORDERS; i++) {
 		INIT_LIST_HEAD(&p->nonfull_clusters[i]);
 		INIT_LIST_HEAD(&p->frag_clusters[i]);
+		p->frag_cluster_nr[i] = 0;
 	}
 
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
@@ -3085,7 +3164,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
 	if (!cluster_info)
 		return nr_extents;
 
-
 	/*
	 * Reduce false cache line sharing between cluster_info and
	 * sharing same address space.
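The new frag_cluster_nr counter exists so the fallback path can rotate frag_clusters[order] without walking it forever: reclaim drops si->lock, so clusters can be added or removed mid-walk, and the counter bounds the number of rotations. Below is a simplified sketch of that rotate-and-bound pattern, using a hypothetical ring of clusters and a fake try_alloc_from() in place of the kernel's list_head machinery and alloc_swap_scan_cluster().

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical simplified cluster; the kernel uses struct swap_cluster_info. */
struct cluster {
	int id;
	struct cluster *next;
};

static struct cluster c2 = { 2, NULL }, c1 = { 1, &c2 }, c0 = { 0, &c1 };
static struct cluster *frag_head = &c0;	/* head of the frag "list" */
static unsigned int frag_nr = 3;	/* mirrors si->frag_cluster_nr[order] */

static bool try_alloc_from(struct cluster *ci)
{
	printf("scanning cluster %d\n", ci->id);
	return ci->id == 2;	/* pretend only cluster 2 has a usable range */
}

int main(void)
{
	unsigned int frags = 0;
	bool found = false;

	c2.next = &c0;	/* close the ring */

	/*
	 * Rotate through at most frag_nr clusters, like the kernel's
	 * "while (frags < si->frag_cluster_nr[order])" loop: failing
	 * clusters move behind, usable ones surface toward the front.
	 */
	while (!found && frags < frag_nr) {
		struct cluster *ci = frag_head;

		frag_head = ci->next;	/* rotate: old head becomes the tail */
		found = try_alloc_from(ci);
		frags++;
	}

	printf(found ? "allocated\n" : "frag list exhausted\n");
	return 0;
}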
