
Commit 7225522

tehcaster authored and torvalds committed
mm: munlock: batch non-THP page isolation and munlock+putback using pagevec
Currently, munlock_vma_range() calls munlock_vma_page on each page in a
loop, which results in repeated taking and releasing of the lru_lock
spinlock for isolating pages one by one. This patch batches the munlock
operations using an on-stack pagevec, so that isolation is done under a
single lru_lock. For THP pages, the old behavior is preserved as they
might be split while putting them into the pagevec. After this patch, a
9% speedup was measured for munlocking a 56GB large memory area with THP
disabled.

A new function __munlock_pagevec() is introduced that takes a pagevec and:

1) It clears PageMlocked and isolates all pages under lru_lock. Zone
   page stats can also be updated using the variant which assumes
   disabled interrupts.

2) It finishes the munlock and lru putback on all pages under their
   lock_page. Note that previously, lock_page covered also the
   PageMlocked clearing and page isolation, but it is not needed for
   those operations.

Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
Reviewed-by: Jörn Engel <joern@logfs.org>
Acked-by: Mel Gorman <mgorman@suse.de>
Cc: Michel Lespinasse <walken@google.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Vlastimil Babka <vbabka@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
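For readers unfamiliar with pagevecs, the on-stack batching idiom the patch builds on looks roughly like the sketch below. This is a minimal illustration, not the patch itself; next_page() and process_batch() are hypothetical placeholders, with process_batch() standing in for __munlock_pagevec().

    struct pagevec pvec;
    struct page *page;

    pagevec_init(&pvec, 0);                 /* 0: pages are not cache-cold */
    while ((page = next_page()) != NULL) {  /* hypothetical page source */
            /* pagevec_add() returns the free slots left; 0 means the pagevec is now full */
            if (pagevec_add(&pvec, page) == 0)
                    process_batch(&pvec);   /* drain; assumed to pagevec_reinit() it */
    }
    if (pagevec_count(&pvec))               /* flush the final, partially filled batch */
            process_batch(&pvec);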
1 parent 586a32a commit 7225522


mm/mlock.c

Lines changed: 156 additions & 40 deletions
@@ -11,13 +11,16 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/pagemap.h>
+#include <linux/pagevec.h>
 #include <linux/mempolicy.h>
 #include <linux/syscalls.h>
 #include <linux/sched.h>
 #include <linux/export.h>
 #include <linux/rmap.h>
 #include <linux/mmzone.h>
 #include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
 
 #include "internal.h"
 
@@ -87,6 +90,47 @@ void mlock_vma_page(struct page *page)
 	}
 }
 
+/*
+ * Finish munlock after successful page isolation
+ *
+ * Page must be locked. This is a wrapper for try_to_munlock()
+ * and putback_lru_page() with munlock accounting.
+ */
+static void __munlock_isolated_page(struct page *page)
+{
+	int ret = SWAP_AGAIN;
+
+	/*
+	 * Optimization: if the page was mapped just once, that's our mapping
+	 * and we don't need to check all the other vmas.
+	 */
+	if (page_mapcount(page) > 1)
+		ret = try_to_munlock(page);
+
+	/* Did try_to_unlock() succeed or punt? */
+	if (ret != SWAP_MLOCK)
+		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+
+	putback_lru_page(page);
+}
+
+/*
+ * Accounting for page isolation fail during munlock
+ *
+ * Performs accounting when page isolation fails in munlock. There is nothing
+ * else to do because it means some other task has already removed the page
+ * from the LRU. putback_lru_page() will take care of removing the page from
+ * the unevictable list, if necessary. vmscan [page_referenced()] will move
+ * the page back to the unevictable list if some other vma has it mlocked.
+ */
+static void __munlock_isolation_failed(struct page *page)
+{
+	if (PageUnevictable(page))
+		count_vm_event(UNEVICTABLE_PGSTRANDED);
+	else
+		count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+}
+
 /**
  * munlock_vma_page - munlock a vma page
  * @page - page to be unlocked
@@ -112,37 +156,10 @@ unsigned int munlock_vma_page(struct page *page)
 		unsigned int nr_pages = hpage_nr_pages(page);
 		mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
 		page_mask = nr_pages - 1;
-		if (!isolate_lru_page(page)) {
-			int ret = SWAP_AGAIN;
-
-			/*
-			 * Optimization: if the page was mapped just once,
-			 * that's our mapping and we don't need to check all the
-			 * other vmas.
-			 */
-			if (page_mapcount(page) > 1)
-				ret = try_to_munlock(page);
-			/*
-			 * did try_to_unlock() succeed or punt?
-			 */
-			if (ret != SWAP_MLOCK)
-				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
-
-			putback_lru_page(page);
-		} else {
-			/*
-			 * Some other task has removed the page from the LRU.
-			 * putback_lru_page() will take care of removing the
-			 * page from the unevictable list, if necessary.
-			 * vmscan [page_referenced()] will move the page back
-			 * to the unevictable list if some other vma has it
-			 * mlocked.
-			 */
-			if (PageUnevictable(page))
-				count_vm_event(UNEVICTABLE_PGSTRANDED);
-			else
-				count_vm_event(UNEVICTABLE_PGMUNLOCKED);
-		}
+		if (!isolate_lru_page(page))
+			__munlock_isolated_page(page);
+		else
+			__munlock_isolation_failed(page);
 	}
 
 	return page_mask;
@@ -209,6 +226,73 @@ static int __mlock_posix_error_return(long retval)
 	return retval;
 }
 
+/*
+ * Munlock a batch of pages from the same zone
+ *
+ * The work is split to two main phases. First phase clears the Mlocked flag
+ * and attempts to isolate the pages, all under a single zone lru lock.
+ * The second phase finishes the munlock only for pages where isolation
+ * succeeded.
+ *
+ * Note that pvec is modified during the process. Before returning
+ * pagevec_reinit() is called on it.
+ */
+static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
+{
+	int i;
+	int nr = pagevec_count(pvec);
+
+	/* Phase 1: page isolation */
+	spin_lock_irq(&zone->lru_lock);
+	for (i = 0; i < nr; i++) {
+		struct page *page = pvec->pages[i];
+
+		if (TestClearPageMlocked(page)) {
+			struct lruvec *lruvec;
+			int lru;
+
+			/* we have disabled interrupts */
+			__mod_zone_page_state(zone, NR_MLOCK, -1);
+
+			if (PageLRU(page)) {
+				lruvec = mem_cgroup_page_lruvec(page, zone);
+				lru = page_lru(page);
+
+				get_page(page);
+				ClearPageLRU(page);
+				del_page_from_lru_list(page, lruvec, lru);
+			} else {
+				__munlock_isolation_failed(page);
+				goto skip_munlock;
+			}
+
+		} else {
+skip_munlock:
+			/*
+			 * We won't be munlocking this page in the next phase
+			 * but we still need to release the follow_page_mask()
+			 * pin.
+			 */
+			pvec->pages[i] = NULL;
+			put_page(page);
+		}
+	}
+	spin_unlock_irq(&zone->lru_lock);
+
+	/* Phase 2: page munlock and putback */
+	for (i = 0; i < nr; i++) {
+		struct page *page = pvec->pages[i];
+
+		if (page) {
+			lock_page(page);
+			__munlock_isolated_page(page);
+			unlock_page(page);
+			put_page(page); /* pin from follow_page_mask() */
+		}
+	}
+	pagevec_reinit(pvec);
+}
+
 /*
  * munlock_vma_pages_range() - munlock all pages in the vma range.'
  * @vma - vma containing range to be munlock()ed.
@@ -230,11 +314,16 @@
 void munlock_vma_pages_range(struct vm_area_struct *vma,
 			     unsigned long start, unsigned long end)
 {
+	struct pagevec pvec;
+	struct zone *zone = NULL;
+
+	pagevec_init(&pvec, 0);
 	vma->vm_flags &= ~VM_LOCKED;
 
 	while (start < end) {
 		struct page *page;
 		unsigned int page_mask, page_increm;
+		struct zone *pagezone;
 
 		/*
 		 * Although FOLL_DUMP is intended for get_dump_page(),
@@ -246,20 +335,47 @@ void munlock_vma_pages_range(struct vm_area_struct *vma,
 		page = follow_page_mask(vma, start, FOLL_GET | FOLL_DUMP,
 					&page_mask);
 		if (page && !IS_ERR(page)) {
-			lock_page(page);
-			/*
-			 * Any THP page found by follow_page_mask() may have
-			 * gotten split before reaching munlock_vma_page(),
-			 * so we need to recompute the page_mask here.
-			 */
-			page_mask = munlock_vma_page(page);
-			unlock_page(page);
-			put_page(page);
+			pagezone = page_zone(page);
+			/* The whole pagevec must be in the same zone */
+			if (pagezone != zone) {
+				if (pagevec_count(&pvec))
+					__munlock_pagevec(&pvec, zone);
+				zone = pagezone;
+			}
+			if (PageTransHuge(page)) {
+				/*
+				 * THP pages are not handled by pagevec due
+				 * to their possible split (see below).
+				 */
+				if (pagevec_count(&pvec))
+					__munlock_pagevec(&pvec, zone);
+				lock_page(page);
+				/*
+				 * Any THP page found by follow_page_mask() may
+				 * have gotten split before reaching
+				 * munlock_vma_page(), so we need to recompute
+				 * the page_mask here.
+				 */
+				page_mask = munlock_vma_page(page);
+				unlock_page(page);
+				put_page(page); /* follow_page_mask() */
+			} else {
+				/*
+				 * Non-huge pages are handled in batches
+				 * via pagevec. The pin from
+				 * follow_page_mask() prevents them from
+				 * collapsing by THP.
+				 */
+				if (pagevec_add(&pvec, page) == 0)
+					__munlock_pagevec(&pvec, zone);
+			}
 		}
 		page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
 		start += page_increm * PAGE_SIZE;
 		cond_resched();
 	}
+	if (pagevec_count(&pvec))
+		__munlock_pagevec(&pvec, zone);
 }
 
 /*