Skip to content

Commit 27c73ae

Browse files
aagittorvalds
authored andcommitted
mm: hugetlbfs: fix hugetlbfs optimization
Commit 7cb2ef5 ("mm: fix aio performance regression for database caused by THP") can cause dereference of a dangling pointer if split_huge_page runs during PageHuge() if there are updates to the tail_page->private field. Also it is repeating compound_head twice for hugetlbfs and it is running compound_head+compound_trans_head for THP when a single one is needed in both cases. The new code within the PageSlab() check doesn't need to verify that the THP page size is never bigger than the smallest hugetlbfs page size, to avoid memory corruption. A longstanding theoretical race condition was found while fixing the above (see the change right after the skip_unlock label, that is relevant for the compound_lock path too). By re-establishing the _mapcount tail refcounting for all compound pages, this also fixes the below problem: echo 0 >/sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages BUG: Bad page state in process bash pfn:59a01 page:ffffea000139b038 count:0 mapcount:10 mapping: (null) index:0x0 page flags: 0x1c00000000008000(tail) Modules linked in: CPU: 6 PID: 2018 Comm: bash Not tainted 3.12.0+ hardkernel#25 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x55/0x76 bad_page+0xd5/0x130 free_pages_prepare+0x213/0x280 __free_pages+0x36/0x80 update_and_free_page+0xc1/0xd0 free_pool_huge_page+0xc2/0xe0 set_max_huge_pages.part.58+0x14c/0x220 nr_hugepages_store_common.isra.60+0xd0/0xf0 nr_hugepages_store+0x13/0x20 kobj_attr_store+0xf/0x20 sysfs_write_file+0x189/0x1e0 vfs_write+0xc5/0x1f0 SyS_write+0x55/0xb0 system_call_fastpath+0x16/0x1b Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com> Signed-off-by: Andrea Arcangeli <aarcange@redhat.com> Tested-by: Khalid Aziz <khalid.aziz@oracle.com> Cc: Pravin Shelar <pshelar@nicira.com> Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org> Cc: Ben Hutchings <bhutchings@solarflare.com> Cc: Christoph Lameter <cl@linux.com> Cc: Johannes Weiner <jweiner@redhat.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Rik van Riel <riel@redhat.com> Cc: Andi Kleen <andi@firstfloor.org> Cc: Minchan Kim <minchan@kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 044c8d4 commit 27c73ae

File tree

3 files changed

+106
-60
lines changed

3 files changed

+106
-60
lines changed

include/linux/hugetlb.h

+6
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ struct hugepage_subpool *hugepage_new_subpool(long nr_blocks);
3131
void hugepage_put_subpool(struct hugepage_subpool *spool);
3232

3333
int PageHuge(struct page *page);
34+
int PageHeadHuge(struct page *page_head);
3435

3536
void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
3637
int hugetlb_sysctl_handler(struct ctl_table *, int, void __user *, size_t *, loff_t *);
@@ -103,6 +104,11 @@ static inline int PageHuge(struct page *page)
103104
return 0;
104105
}
105106

107+
static inline int PageHeadHuge(struct page *page_head)
108+
{
109+
return 0;
110+
}
111+
106112
static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
107113
{
108114
}

mm/hugetlb.c

+17
Original file line numberDiff line numberDiff line change
@@ -702,6 +702,23 @@ int PageHuge(struct page *page)
702702
}
703703
EXPORT_SYMBOL_GPL(PageHuge);
704704

705+
/*
706+
* PageHeadHuge() only returns true for hugetlbfs head page, but not for
707+
* normal or transparent huge pages.
708+
*/
709+
int PageHeadHuge(struct page *page_head)
710+
{
711+
compound_page_dtor *dtor;
712+
713+
if (!PageHead(page_head))
714+
return 0;
715+
716+
dtor = get_compound_page_dtor(page_head);
717+
718+
return dtor == free_huge_page;
719+
}
720+
EXPORT_SYMBOL_GPL(PageHeadHuge);
721+
705722
pgoff_t __basepage_index(struct page *page)
706723
{
707724
struct page *page_head = compound_head(page);

mm/swap.c

+83-60
Original file line numberDiff line numberDiff line change
@@ -82,19 +82,6 @@ static void __put_compound_page(struct page *page)
8282

8383
static void put_compound_page(struct page *page)
8484
{
85-
/*
86-
* hugetlbfs pages cannot be split from under us. If this is a
87-
* hugetlbfs page, check refcount on head page and release the page if
88-
* the refcount becomes zero.
89-
*/
90-
if (PageHuge(page)) {
91-
page = compound_head(page);
92-
if (put_page_testzero(page))
93-
__put_compound_page(page);
94-
95-
return;
96-
}
97-
9885
if (unlikely(PageTail(page))) {
9986
/* __split_huge_page_refcount can run under us */
10087
struct page *page_head = compound_trans_head(page);
@@ -111,14 +98,31 @@ static void put_compound_page(struct page *page)
11198
* still hot on arches that do not support
11299
* this_cpu_cmpxchg_double().
113100
*/
114-
if (PageSlab(page_head)) {
115-
if (PageTail(page)) {
101+
if (PageSlab(page_head) || PageHeadHuge(page_head)) {
102+
if (likely(PageTail(page))) {
103+
/*
104+
* __split_huge_page_refcount
105+
* cannot race here.
106+
*/
107+
VM_BUG_ON(!PageHead(page_head));
108+
atomic_dec(&page->_mapcount);
116109
if (put_page_testzero(page_head))
117110
VM_BUG_ON(1);
118-
119-
atomic_dec(&page->_mapcount);
120-
goto skip_lock_tail;
111+
if (put_page_testzero(page_head))
112+
__put_compound_page(page_head);
113+
return;
121114
} else
115+
/*
116+
* __split_huge_page_refcount
117+
* run before us, "page" was a
118+
* THP tail. The split
119+
* page_head has been freed
120+
* and reallocated as slab or
121+
* hugetlbfs page of smaller
122+
* order (only possible if
123+
* reallocated as slab on
124+
* x86).
125+
*/
122126
goto skip_lock;
123127
}
124128
/*
@@ -132,8 +136,27 @@ static void put_compound_page(struct page *page)
132136
/* __split_huge_page_refcount run before us */
133137
compound_unlock_irqrestore(page_head, flags);
134138
skip_lock:
135-
if (put_page_testzero(page_head))
136-
__put_single_page(page_head);
139+
if (put_page_testzero(page_head)) {
140+
/*
141+
* The head page may have been
142+
* freed and reallocated as a
143+
* compound page of smaller
144+
* order and then freed again.
145+
* All we know is that it
146+
* cannot have become: a THP
147+
* page, a compound page of
148+
* higher order, a tail page.
149+
* That is because we still
150+
* hold the refcount of the
151+
* split THP tail and
152+
* page_head was the THP head
153+
* before the split.
154+
*/
155+
if (PageHead(page_head))
156+
__put_compound_page(page_head);
157+
else
158+
__put_single_page(page_head);
159+
}
137160
out_put_single:
138161
if (put_page_testzero(page))
139162
__put_single_page(page);
@@ -155,7 +178,6 @@ static void put_compound_page(struct page *page)
155178
VM_BUG_ON(atomic_read(&page->_count) != 0);
156179
compound_unlock_irqrestore(page_head, flags);
157180

158-
skip_lock_tail:
159181
if (put_page_testzero(page_head)) {
160182
if (PageHead(page_head))
161183
__put_compound_page(page_head);
@@ -198,51 +220,52 @@ bool __get_page_tail(struct page *page)
198220
* proper PT lock that already serializes against
199221
* split_huge_page().
200222
*/
223+
unsigned long flags;
201224
bool got = false;
202-
struct page *page_head;
203-
204-
/*
205-
* If this is a hugetlbfs page it cannot be split under us. Simply
206-
* increment refcount for the head page.
207-
*/
208-
if (PageHuge(page)) {
209-
page_head = compound_head(page);
210-
atomic_inc(&page_head->_count);
211-
got = true;
212-
} else {
213-
unsigned long flags;
225+
struct page *page_head = compound_trans_head(page);
214226

215-
page_head = compound_trans_head(page);
216-
if (likely(page != page_head &&
217-
get_page_unless_zero(page_head))) {
218-
219-
/* Ref to put_compound_page() comment. */
220-
if (PageSlab(page_head)) {
221-
if (likely(PageTail(page))) {
222-
__get_page_tail_foll(page, false);
223-
return true;
224-
} else {
225-
put_page(page_head);
226-
return false;
227-
}
228-
}
229-
230-
/*
231-
* page_head wasn't a dangling pointer but it
232-
* may not be a head page anymore by the time
233-
* we obtain the lock. That is ok as long as it
234-
* can't be freed from under us.
235-
*/
236-
flags = compound_lock_irqsave(page_head);
237-
/* here __split_huge_page_refcount won't run anymore */
227+
if (likely(page != page_head && get_page_unless_zero(page_head))) {
228+
/* Ref to put_compound_page() comment. */
229+
if (PageSlab(page_head) || PageHeadHuge(page_head)) {
238230
if (likely(PageTail(page))) {
231+
/*
232+
* This is a hugetlbfs page or a slab
233+
* page. __split_huge_page_refcount
234+
* cannot race here.
235+
*/
236+
VM_BUG_ON(!PageHead(page_head));
239237
__get_page_tail_foll(page, false);
240-
got = true;
241-
}
242-
compound_unlock_irqrestore(page_head, flags);
243-
if (unlikely(!got))
238+
return true;
239+
} else {
240+
/*
241+
* __split_huge_page_refcount run
242+
* before us, "page" was a THP
243+
* tail. The split page_head has been
244+
* freed and reallocated as slab or
245+
* hugetlbfs page of smaller order
246+
* (only possible if reallocated as
247+
* slab on x86).
248+
*/
244249
put_page(page_head);
250+
return false;
251+
}
252+
}
253+
254+
/*
255+
* page_head wasn't a dangling pointer but it
256+
* may not be a head page anymore by the time
257+
* we obtain the lock. That is ok as long as it
258+
* can't be freed from under us.
259+
*/
260+
flags = compound_lock_irqsave(page_head);
261+
/* here __split_huge_page_refcount won't run anymore */
262+
if (likely(PageTail(page))) {
263+
__get_page_tail_foll(page, false);
264+
got = true;
245265
}
266+
compound_unlock_irqrestore(page_head, flags);
267+
if (unlikely(!got))
268+
put_page(page_head);
246269
}
247270
return got;
248271
}

0 commit comments

Comments
 (0)