@@ -513,6 +513,10 @@ static void free_cluster(struct swap_info_struct *si, struct swap_cluster_info *
513513 VM_BUG_ON (ci -> count != 0 );
514514 lockdep_assert_held (& si -> lock );
515515 lockdep_assert_held (& ci -> lock );
516+
517+ if (ci -> flags & CLUSTER_FLAG_FRAG )
518+ si -> frag_cluster_nr [ci -> order ]-- ;
519+
516520 /*
517521 * If the swap is discardable, prepare to discard the cluster
518522 * instead of freeing it immediately. The cluster will be freed
@@ -572,31 +576,84 @@ static void dec_cluster_info_page(struct swap_info_struct *p,
572576
573577 if (!(ci -> flags & CLUSTER_FLAG_NONFULL )) {
574578 VM_BUG_ON (ci -> flags & CLUSTER_FLAG_FREE );
575- if (ci -> flags & CLUSTER_FLAG_FRAG )
579+ if (ci -> flags & CLUSTER_FLAG_FRAG ) {
580+ p -> frag_cluster_nr [ci -> order ]-- ;
576581 list_move_tail (& ci -> list , & p -> nonfull_clusters [ci -> order ]);
577- else
582+ } else {
578583 list_add_tail (& ci -> list , & p -> nonfull_clusters [ci -> order ]);
584+ }
579585 ci -> flags = CLUSTER_FLAG_NONFULL ;
580586 }
581587}
582588
583- static inline bool cluster_scan_range (struct swap_info_struct * si , unsigned int start ,
584- unsigned int nr_pages )
589+ static bool cluster_reclaim_range (struct swap_info_struct * si ,
590+ struct swap_cluster_info * ci ,
591+ unsigned long start , unsigned long end )
585592{
586- unsigned char * p = si -> swap_map + start ;
587- unsigned char * end = p + nr_pages ;
593+ unsigned char * map = si -> swap_map ;
594+ unsigned long offset ;
595+
596+ spin_unlock (& ci -> lock );
597+ spin_unlock (& si -> lock );
598+
599+ for (offset = start ; offset < end ; offset ++ ) {
600+ switch (READ_ONCE (map [offset ])) {
601+ case 0 :
602+ continue ;
603+ case SWAP_HAS_CACHE :
604+ if (__try_to_reclaim_swap (si , offset , TTRS_ANYWAY | TTRS_DIRECT ) > 0 )
605+ continue ;
606+ goto out ;
607+ default :
608+ goto out ;
609+ }
610+ }
611+ out :
612+ spin_lock (& si -> lock );
613+ spin_lock (& ci -> lock );
588614
589- while (p < end )
590- if (* p ++ )
615+ /*
616+ * Recheck the range whether or not reclaim succeeded; the slots
617+ * could have been freed while we were not holding the locks.
618+ */
619+ for (offset = start ; offset < end ; offset ++ )
620+ if (READ_ONCE (map [offset ]))
591621 return false;
592622
593623 return true;
594624}
595625
626+ static bool cluster_scan_range (struct swap_info_struct * si ,
627+ struct swap_cluster_info * ci ,
628+ unsigned long start , unsigned int nr_pages )
629+ {
630+ unsigned long offset , end = start + nr_pages ;
631+ unsigned char * map = si -> swap_map ;
632+ bool need_reclaim = false;
596633
597- static inline void cluster_alloc_range (struct swap_info_struct * si , struct swap_cluster_info * ci ,
598- unsigned int start , unsigned char usage ,
599- unsigned int order )
634+ for (offset = start ; offset < end ; offset ++ ) {
635+ switch (READ_ONCE (map [offset ])) {
636+ case 0 :
637+ continue ;
638+ case SWAP_HAS_CACHE :
639+ if (!vm_swap_full ())
640+ return false;
641+ need_reclaim = true;
642+ continue ;
643+ default :
644+ return false;
645+ }
646+ }
647+
648+ if (need_reclaim )
649+ return cluster_reclaim_range (si , ci , start , end );
650+
651+ return true;
652+ }
653+
654+ static void cluster_alloc_range (struct swap_info_struct * si , struct swap_cluster_info * ci ,
655+ unsigned int start , unsigned char usage ,
656+ unsigned int order )
600657{
601658 unsigned int nr_pages = 1 << order ;
602659
@@ -615,6 +672,8 @@ static inline void cluster_alloc_range(struct swap_info_struct *si, struct swap_
615672 if (ci -> count == SWAPFILE_CLUSTER ) {
616673 VM_BUG_ON (!(ci -> flags &
617674 (CLUSTER_FLAG_FREE | CLUSTER_FLAG_NONFULL | CLUSTER_FLAG_FRAG )));
675+ if (ci -> flags & CLUSTER_FLAG_FRAG )
676+ si -> frag_cluster_nr [ci -> order ]-- ;
618677 list_del (& ci -> list );
619678 ci -> flags = 0 ;
620679 }
@@ -640,7 +699,7 @@ static unsigned int alloc_swap_scan_cluster(struct swap_info_struct *si, unsigne
640699 }
641700
642701 while (offset <= end ) {
643- if (cluster_scan_range (si , offset , nr_pages )) {
702+ if (cluster_scan_range (si , ci , offset , nr_pages )) {
644703 cluster_alloc_range (si , ci , offset , usage , order );
645704 * foundp = offset ;
646705 if (ci -> count == SWAPFILE_CLUSTER ) {
@@ -668,9 +727,8 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
668727 unsigned char usage )
669728{
670729 struct percpu_cluster * cluster ;
671- struct swap_cluster_info * ci , * n ;
730+ struct swap_cluster_info * ci ;
672731 unsigned int offset , found = 0 ;
673- LIST_HEAD (fraged );
674732
675733new_cluster :
676734 lockdep_assert_held (& si -> lock );
@@ -690,25 +748,42 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
690748 }
691749
692750 if (order < PMD_ORDER ) {
693- list_for_each_entry_safe (ci , n , & si -> nonfull_clusters [order ], list ) {
694- list_move_tail (& ci -> list , & fraged );
751+ unsigned int frags = 0 ;
752+
753+ while (!list_empty (& si -> nonfull_clusters [order ])) {
754+ ci = list_first_entry (& si -> nonfull_clusters [order ],
755+ struct swap_cluster_info , list );
756+ list_move_tail (& ci -> list , & si -> frag_clusters [order ]);
695757 ci -> flags = CLUSTER_FLAG_FRAG ;
758+ si -> frag_cluster_nr [order ]++ ;
696759 offset = alloc_swap_scan_cluster (si , cluster_offset (si , ci ),
697760 & found , order , usage );
761+ frags ++ ;
698762 if (found )
699763 break ;
700764 }
701765
702766 if (!found ) {
703- list_for_each_entry_safe (ci , n , & si -> frag_clusters [order ], list ) {
767+ /*
768+ * Nonfull clusters were moved to the frag list tail if we
769+ * reached here; count them too so the frag list is not over-scanned.
770+ */
771+ while (frags < si -> frag_cluster_nr [order ]) {
772+ ci = list_first_entry (& si -> frag_clusters [order ],
773+ struct swap_cluster_info , list );
774+ /*
775+ * Rotate the frag list to iterate; these clusters all failed
776+ * high order allocation or were moved here due to per-CPU usage.
777+ * This helps keep usable clusters ahead.
778+ */
779+ list_move_tail (& ci -> list , & si -> frag_clusters [order ]);
704780 offset = alloc_swap_scan_cluster (si , cluster_offset (si , ci ),
705781 & found , order , usage );
782+ frags ++ ;
706783 if (found )
707784 break ;
708785 }
709786 }
710-
711- list_splice_tail (& fraged , & si -> frag_clusters [order ]);
712787 }
713788
714789 if (found )
@@ -729,25 +804,28 @@ static unsigned long cluster_alloc_swap_entry(struct swap_info_struct *si, int o
729804
730805 /* Order 0 stealing from higher order */
731806 for (int o = 1 ; o < SWAP_NR_ORDERS ; o ++ ) {
732- if (!list_empty (& si -> frag_clusters [o ])) {
807+ /*
808+ * Clusters here have at least one usable slot and can't fail order 0
809+ * allocation, but reclaim may drop si->lock and race with another user.
810+ */
811+ while (!list_empty (& si -> frag_clusters [o ])) {
733812 ci = list_first_entry (& si -> frag_clusters [o ],
734813 struct swap_cluster_info , list );
735- offset = alloc_swap_scan_cluster (si , cluster_offset (si , ci ), & found ,
736- 0 , usage );
737- VM_BUG_ON (! found );
738- goto done ;
814+ offset = alloc_swap_scan_cluster (si , cluster_offset (si , ci ),
815+ & found , 0 , usage );
816+ if ( found )
817+ goto done ;
739818 }
740819
741- if (!list_empty (& si -> nonfull_clusters [o ])) {
742- ci = list_first_entry (& si -> nonfull_clusters [o ], struct swap_cluster_info ,
743- list );
820+ while (!list_empty (& si -> nonfull_clusters [o ])) {
821+ ci = list_first_entry (& si -> nonfull_clusters [o ],
822+ struct swap_cluster_info , list );
744823 offset = alloc_swap_scan_cluster (si , cluster_offset (si , ci ),
745824 & found , 0 , usage );
746- VM_BUG_ON (! found );
747- goto done ;
825+ if ( found )
826+ goto done ;
748827 }
749828 }
750-
751829done :
752830 cluster -> next [order ] = offset ;
753831 return found ;
@@ -3042,6 +3120,7 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
30423120 for (i = 0 ; i < SWAP_NR_ORDERS ; i ++ ) {
30433121 INIT_LIST_HEAD (& p -> nonfull_clusters [i ]);
30443122 INIT_LIST_HEAD (& p -> frag_clusters [i ]);
3123+ p -> frag_cluster_nr [i ] = 0 ;
30453124 }
30463125
30473126 for (i = 0 ; i < swap_header -> info .nr_badpages ; i ++ ) {
@@ -3085,7 +3164,6 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p,
30853164 if (!cluster_info )
30863165 return nr_extents ;
30873166
3088-
30893167 /*
30903168 * Reduce false cache line sharing between cluster_info and
30913169 * sharing same address space.
0 commit comments