Skip to content

Commit e66f17f

Browse files
Naoya Horiguchi authored and torvalds committed
mm/hugetlb: take page table lock in follow_huge_pmd()
We have a race condition between move_pages() and freeing hugepages, where move_pages() calls follow_page(FOLL_GET) for hugepages internally and tries to get its refcount without preventing concurrent freeing. This race crashes the kernel, so this patch fixes it by moving FOLL_GET code for hugepages into follow_huge_pmd() with taking the page table lock.

This patch intentionally removes page==NULL check after pte_page. This is justified because pte_page() never returns NULL for any architectures or configurations.

This patch changes the behavior of follow_huge_pmd() for tail pages and then tail pages can be pinned/returned. So the caller must be changed to properly handle the returned tail pages.

We could have a choice to add the similar locking to follow_huge_(addr|pud) for consistency, but it's not necessary because currently these functions don't support FOLL_GET flag, so let's leave it for future development.

Here is the reproducer:

  $ cat movepages.c
  #include <stdio.h>
  #include <stdlib.h>
  #include <numaif.h>

  #define ADDR_INPUT 0x700000000000UL
  #define HPS 0x200000
  #define PS 0x1000

  int main(int argc, char *argv[]) {
          int i;
          int nr_hp = strtol(argv[1], NULL, 0);
          int nr_p = nr_hp * HPS / PS;
          int ret;
          void **addrs;
          int *status;
          int *nodes;
          pid_t pid;

          pid = strtol(argv[2], NULL, 0);
          addrs = malloc(sizeof(char *) * nr_p + 1);
          status = malloc(sizeof(char *) * nr_p + 1);
          nodes = malloc(sizeof(char *) * nr_p + 1);

          while (1) {
                  for (i = 0; i < nr_p; i++) {
                          addrs[i] = (void *)ADDR_INPUT + i * PS;
                          nodes[i] = 1;
                          status[i] = 0;
                  }
                  ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
                                        MPOL_MF_MOVE_ALL);
                  if (ret == -1)
                          err("move_pages");

                  for (i = 0; i < nr_p; i++) {
                          addrs[i] = (void *)ADDR_INPUT + i * PS;
                          nodes[i] = 0;
                          status[i] = 0;
                  }
                  ret = numa_move_pages(pid, nr_p, addrs, nodes, status,
                                        MPOL_MF_MOVE_ALL);
                  if (ret == -1)
                          err("move_pages");
          }
          return 0;
  }

  $ cat hugepage.c
  #include <stdio.h>
  #include <sys/mman.h>
  #include <string.h>

  #define ADDR_INPUT 0x700000000000UL
  #define HPS 0x200000

  int main(int argc, char *argv[]) {
          int nr_hp = strtol(argv[1], NULL, 0);
          char *p;

          while (1) {
                  p = mmap((void *)ADDR_INPUT, nr_hp * HPS, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
                  if (p != (void *)ADDR_INPUT) {
                          perror("mmap");
                          break;
                  }
                  memset(p, 0, nr_hp * HPS);
                  munmap(p, nr_hp * HPS);
          }
  }

  $ sysctl vm.nr_hugepages=40
  $ ./hugepage 10 &
  $ ./movepages 10 $(pgrep -f hugepage)

Fixes: e632a93 ("mm: migrate: add hugepage migration code to move_pages()")
Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Reported-by: Hugh Dickins <hughd@google.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Rik van Riel <riel@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Luiz Capitulino <lcapitulino@redhat.com>
Cc: Nishanth Aravamudan <nacc@linux.vnet.ibm.com>
Cc: Lee Schermerhorn <lee.schermerhorn@hp.com>
Cc: Steve Capper <steve.capper@linaro.org>
Cc: <stable@vger.kernel.org> [3.12+]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent cbef847 commit e66f17f

File tree

5 files changed

+53
-37
lines changed

5 files changed

+53
-37
lines changed

include/linux/hugetlb.h

+4 -4
Original file line number | Diff line number | Diff line change
@@ -99,9 +99,9 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
9999
struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
100100
int write);
101101
struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
102-
pmd_t *pmd, int write);
102+
pmd_t *pmd, int flags);
103103
struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address,
104-
pud_t *pud, int write);
104+
pud_t *pud, int flags);
105105
int pmd_huge(pmd_t pmd);
106106
int pud_huge(pud_t pmd);
107107
unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
@@ -133,8 +133,8 @@ static inline void hugetlb_report_meminfo(struct seq_file *m)
133133
static inline void hugetlb_show_meminfo(void)
134134
{
135135
}
136-
#define follow_huge_pmd(mm, addr, pmd, write) NULL
137-
#define follow_huge_pud(mm, addr, pud, write) NULL
136+
#define follow_huge_pmd(mm, addr, pmd, flags) NULL
137+
#define follow_huge_pud(mm, addr, pud, flags) NULL
138138
#define prepare_hugepage_range(file, addr, len) (-EINVAL)
139139
#define pmd_huge(x) 0
140140
#define pud_huge(x) 0

include/linux/swapops.h

+4
Original file line number | Diff line number | Diff line change
@@ -135,6 +135,8 @@ static inline void make_migration_entry_read(swp_entry_t *entry)
135135
*entry = swp_entry(SWP_MIGRATION_READ, swp_offset(*entry));
136136
}
137137

138+
extern void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
139+
spinlock_t *ptl);
138140
extern void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
139141
unsigned long address);
140142
extern void migration_entry_wait_huge(struct vm_area_struct *vma,
@@ -148,6 +150,8 @@ static inline int is_migration_entry(swp_entry_t swp)
148150
}
149151
#define migration_entry_to_page(swp) NULL
150152
static inline void make_migration_entry_read(swp_entry_t *entryp) { }
153+
static inline void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
154+
spinlock_t *ptl) { }
151155
static inline void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
152156
unsigned long address) { }
153157
static inline void migration_entry_wait_huge(struct vm_area_struct *vma,

mm/gup.c

+8 -17
Original file line number | Diff line number | Diff line change
@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
167167
if (pud_none(*pud))
168168
return no_page_table(vma, flags);
169169
if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
170-
if (flags & FOLL_GET)
171-
return NULL;
172-
page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
173-
return page;
170+
page = follow_huge_pud(mm, address, pud, flags);
171+
if (page)
172+
return page;
173+
return no_page_table(vma, flags);
174174
}
175175
if (unlikely(pud_bad(*pud)))
176176
return no_page_table(vma, flags);
@@ -179,19 +179,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
179179
if (pmd_none(*pmd))
180180
return no_page_table(vma, flags);
181181
if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
182-
page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
183-
if (flags & FOLL_GET) {
184-
/*
185-
* Refcount on tail pages are not well-defined and
186-
* shouldn't be taken. The caller should handle a NULL
187-
* return when trying to follow tail pages.
188-
*/
189-
if (PageHead(page))
190-
get_page(page);
191-
else
192-
page = NULL;
193-
}
194-
return page;
182+
page = follow_huge_pmd(mm, address, pmd, flags);
183+
if (page)
184+
return page;
185+
return no_page_table(vma, flags);
195186
}
196187
if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
197188
return no_page_table(vma, flags);

mm/hugetlb.c

+34 -14
Original file line number | Diff line number | Diff line change
@@ -3675,28 +3675,48 @@ follow_huge_addr(struct mm_struct *mm, unsigned long address,
36753675

36763676
struct page * __weak
36773677
follow_huge_pmd(struct mm_struct *mm, unsigned long address,
3678-
pmd_t *pmd, int write)
3678+
pmd_t *pmd, int flags)
36793679
{
3680-
struct page *page;
3681-
3682-
if (!pmd_present(*pmd))
3683-
return NULL;
3684-
page = pte_page(*(pte_t *)pmd);
3685-
if (page)
3686-
page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
3680+
struct page *page = NULL;
3681+
spinlock_t *ptl;
3682+
retry:
3683+
ptl = pmd_lockptr(mm, pmd);
3684+
spin_lock(ptl);
3685+
/*
3686+
* make sure that the address range covered by this pmd is not
3687+
* unmapped from other threads.
3688+
*/
3689+
if (!pmd_huge(*pmd))
3690+
goto out;
3691+
if (pmd_present(*pmd)) {
3692+
page = pte_page(*(pte_t *)pmd) +
3693+
((address & ~PMD_MASK) >> PAGE_SHIFT);
3694+
if (flags & FOLL_GET)
3695+
get_page(page);
3696+
} else {
3697+
if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
3698+
spin_unlock(ptl);
3699+
__migration_entry_wait(mm, (pte_t *)pmd, ptl);
3700+
goto retry;
3701+
}
3702+
/*
3703+
* hwpoisoned entry is treated as no_page_table in
3704+
* follow_page_mask().
3705+
*/
3706+
}
3707+
out:
3708+
spin_unlock(ptl);
36873709
return page;
36883710
}
36893711

36903712
struct page * __weak
36913713
follow_huge_pud(struct mm_struct *mm, unsigned long address,
3692-
pud_t *pud, int write)
3714+
pud_t *pud, int flags)
36933715
{
3694-
struct page *page;
3716+
if (flags & FOLL_GET)
3717+
return NULL;
36953718

3696-
page = pte_page(*(pte_t *)pud);
3697-
if (page)
3698-
page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
3699-
return page;
3719+
return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
37003720
}
37013721

37023722
#ifdef CONFIG_MEMORY_FAILURE

mm/migrate.c

+3 -2
Original file line number | Diff line number | Diff line change
@@ -197,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new)
197197
* get to the page and wait until migration is finished.
198198
* When we return from this function the fault will be retried.
199199
*/
200-
static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
200+
void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
201201
spinlock_t *ptl)
202202
{
203203
pte_t pte;
@@ -1236,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
12361236
goto put_and_set;
12371237

12381238
if (PageHuge(page)) {
1239-
isolate_huge_page(page, &pagelist);
1239+
if (PageHead(page))
1240+
isolate_huge_page(page, &pagelist);
12401241
goto put_and_set;
12411242
}
12421243

0 commit comments

Comments
 (0)