
Commit 1c9e8de

mjkravetz authored and torvalds committed
userfaultfd: hugetlbfs: add UFFDIO_COPY support for shared mappings
When userfaultfd hugetlbfs support was originally added, it followed the
pattern of anon mappings and did not support any vmas marked VM_SHARED.
As such, support was only added for private mappings.

Remove this limitation and support shared mappings.  The primary
functional change required is adding pages to the page cache.  More
subtle changes are required for huge page reservation handling in error
paths.  A lengthy comment in the code describes the reservation handling.

[mike.kravetz@oracle.com: update]
  Link: http://lkml.kernel.org/r/c9c8cafe-baa7-05b4-34ea-1dfa5523a85f@oracle.com
Link: http://lkml.kernel.org/r/1487195210-12839-1-git-send-email-mike.kravetz@oracle.com
Signed-off-by: Mike Kravetz <mike.kravetz@oracle.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
Cc: "Dr. David Alan Gilbert" <dgilbert@redhat.com>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Pavel Emelyanov <xemul@parallels.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent cac6732 commit 1c9e8de
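
For context, the following is a minimal userspace sketch (not part of this patch) of what the change enables: resolving a missing fault in a MAP_SHARED hugetlbfs mapping with UFFDIO_COPY. The hugetlbfs path (/dev/hugepages/uffd-test) and the 2 MB huge page size are assumptions, and error handling is abbreviated.

/*
 * Sketch only: pre-fill one huge page of a MAP_SHARED hugetlbfs mapping
 * via UFFDIO_COPY, which this commit allows for shared hugetlb vmas.
 * Assumes a hugetlbfs mount at /dev/hugepages and 2MB huge pages.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* assumed huge page size */

int main(void)
{
	/* Shared hugetlbfs mapping: the case this commit adds support for. */
	int fd = open("/dev/hugepages/uffd-test", O_CREAT | O_RDWR, 0600);
	ftruncate(fd, HPAGE_SIZE);
	char *dst = mmap(NULL, HPAGE_SIZE, PROT_READ | PROT_WRITE,
			 MAP_SHARED, fd, 0);

	/* Set up userfaultfd and register the range for missing faults. */
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
	struct uffdio_api api = { .api = UFFD_API };
	ioctl(uffd, UFFDIO_API, &api);

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)dst, .len = HPAGE_SIZE },
		.mode  = UFFDIO_REGISTER_MODE_MISSING,
	};
	ioctl(uffd, UFFDIO_REGISTER, &reg);

	/* Source buffer whose contents will populate the huge page. */
	char *src = aligned_alloc(HPAGE_SIZE, HPAGE_SIZE);
	memset(src, 0x5a, HPAGE_SIZE);

	/*
	 * UFFDIO_COPY atomically allocates the huge page, copies src into
	 * it and maps it at dst.  Before this commit the kernel rejected
	 * this for VM_SHARED hugetlb vmas.
	 */
	struct uffdio_copy copy = {
		.dst = (unsigned long)dst,
		.src = (unsigned long)src,
		.len = HPAGE_SIZE,
		.mode = 0,
	};
	if (ioctl(uffd, UFFDIO_COPY, &copy) == -1)
		perror("UFFDIO_COPY");

	printf("first byte after copy: 0x%x\n", dst[0]);
	return 0;
}

In a real consumer (QEMU postcopy, for instance) UFFDIO_COPY would typically be issued by a monitor thread in response to fault events read from the userfaultfd; the sketch simply fills the page before any access.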

File tree: 2 files changed, +82 −18 lines

mm/hugetlb.c  (+24 −2)

@@ -3992,6 +3992,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    unsigned long src_addr,
 			    struct page **pagep)
 {
+	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	struct hstate *h = hstate_vma(dst_vma);
 	pte_t _dst_pte;
 	spinlock_t *ptl;
@@ -4028,15 +4029,31 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	__SetPageUptodate(page);
 	set_page_huge_active(page);
 
+	/*
+	 * If shared, add to page cache
+	 */
+	if (vm_shared) {
+		struct address_space *mapping = dst_vma->vm_file->f_mapping;
+		pgoff_t idx = vma_hugecache_offset(h, dst_vma, dst_addr);
+
+		ret = huge_add_to_page_cache(page, mapping, idx);
+		if (ret)
+			goto out_release_nounlock;
+	}
+
 	ptl = huge_pte_lockptr(h, dst_mm, dst_pte);
 	spin_lock(ptl);
 
 	ret = -EEXIST;
 	if (!huge_pte_none(huge_ptep_get(dst_pte)))
 		goto out_release_unlock;
 
-	ClearPagePrivate(page);
-	hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+	if (vm_shared) {
+		page_dup_rmap(page, true);
+	} else {
+		ClearPagePrivate(page);
+		hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
+	}
 
 	_dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
 	if (dst_vma->vm_flags & VM_WRITE)
@@ -4053,11 +4070,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
 	update_mmu_cache(dst_vma, dst_addr, dst_pte);
 
 	spin_unlock(ptl);
+	if (vm_shared)
+		unlock_page(page);
 	ret = 0;
 out:
 	return ret;
 out_release_unlock:
 	spin_unlock(ptl);
+out_release_nounlock:
+	if (vm_shared)
+		unlock_page(page);
 	put_page(page);
 	goto out;
 }

mm/userfaultfd.c  (+58 −16)

@@ -154,6 +154,8 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 					      unsigned long len,
 					      bool zeropage)
 {
+	int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
+	int vm_shared = dst_vma->vm_flags & VM_SHARED;
 	ssize_t err;
 	pte_t *dst_pte;
 	unsigned long src_addr, dst_addr;
@@ -204,14 +206,14 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 			goto out_unlock;
 
 		/*
-		 * Make sure the vma is not shared, that the remaining dst
-		 * range is both valid and fully within a single existing vma.
+		 * Make sure the remaining dst range is both valid and
+		 * fully within a single existing vma.
 		 */
-		if (dst_vma->vm_flags & VM_SHARED)
-			goto out_unlock;
 		if (dst_start < dst_vma->vm_start ||
 		    dst_start + len > dst_vma->vm_end)
 			goto out_unlock;
+
+		vm_shared = dst_vma->vm_flags & VM_SHARED;
 	}
 
 	if (WARN_ON(dst_addr & (vma_hpagesize - 1) ||
@@ -225,11 +227,13 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 		goto out_unlock;
 
 	/*
-	 * Ensure the dst_vma has a anon_vma.
+	 * If not shared, ensure the dst_vma has a anon_vma.
 	 */
 	err = -ENOMEM;
-	if (unlikely(anon_vma_prepare(dst_vma)))
-		goto out_unlock;
+	if (!vm_shared) {
+		if (unlikely(anon_vma_prepare(dst_vma)))
+			goto out_unlock;
+	}
 
 	h = hstate_vma(dst_vma);
 
@@ -266,6 +270,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 						dst_addr, src_addr, &page);
 
 		mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+		vm_alloc_shared = vm_shared;
 
 		cond_resched();
 
@@ -305,18 +310,49 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
 	if (page) {
 		/*
 		 * We encountered an error and are about to free a newly
-		 * allocated huge page.  It is possible that there was a
-		 * reservation associated with the page that has been
-		 * consumed.  See the routine restore_reserve_on_error
-		 * for details.  Unfortunately, we can not call
-		 * restore_reserve_on_error now as it would require holding
-		 * mmap_sem.  Clear the PagePrivate flag so that the global
+		 * allocated huge page.
+		 *
+		 * Reservation handling is very subtle, and is different for
+		 * private and shared mappings.  See the routine
+		 * restore_reserve_on_error for details.  Unfortunately, we
+		 * can not call restore_reserve_on_error now as it would
+		 * require holding mmap_sem.
+		 *
+		 * If a reservation for the page existed in the reservation
+		 * map of a private mapping, the map was modified to indicate
+		 * the reservation was consumed when the page was allocated.
+		 * We clear the PagePrivate flag now so that the global
 		 * reserve count will not be incremented in free_huge_page.
 		 * The reservation map will still indicate the reservation
 		 * was consumed and possibly prevent later page allocation.
-		 * This is better than leaking a global reservation.
+		 * This is better than leaking a global reservation.  If no
+		 * reservation existed, it is still safe to clear PagePrivate
+		 * as no adjustments to reservation counts were made during
+		 * allocation.
+		 *
+		 * The reservation map for shared mappings indicates which
+		 * pages have reservations.  When a huge page is allocated
+		 * for an address with a reservation, no change is made to
+		 * the reserve map.  In this case PagePrivate will be set
+		 * to indicate that the global reservation count should be
+		 * incremented when the page is freed.  This is the desired
+		 * behavior.  However, when a huge page is allocated for an
+		 * address without a reservation a reservation entry is added
+		 * to the reservation map, and PagePrivate will not be set.
+		 * When the page is freed, the global reserve count will NOT
+		 * be incremented and it will appear as though we have leaked
+		 * reserved page.  In this case, set PagePrivate so that the
+		 * global reserve count will be incremented to match the
+		 * reservation map entry which was created.
+		 *
+		 * Note that vm_alloc_shared is based on the flags of the vma
+		 * for which the page was originally allocated.  dst_vma could
+		 * be different or NULL on error.
 		 */
-		ClearPagePrivate(page);
+		if (vm_alloc_shared)
+			SetPagePrivate(page);
+		else
+			ClearPagePrivate(page);
 		put_page(page);
 	}
 	BUG_ON(copied < 0);
@@ -372,8 +408,14 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
 	dst_vma = find_vma(dst_mm, dst_start);
 	if (!dst_vma)
 		goto out_unlock;
-	if (!vma_is_shmem(dst_vma) && dst_vma->vm_flags & VM_SHARED)
+	/*
+	 * shmem_zero_setup is invoked in mmap for MAP_ANONYMOUS|MAP_SHARED but
+	 * it will overwrite vm_ops, so vma_is_anonymous must return false.
+	 */
+	if (WARN_ON_ONCE(vma_is_anonymous(dst_vma) &&
+	    dst_vma->vm_flags & VM_SHARED))
 		goto out_unlock;
+
 	if (dst_start < dst_vma->vm_start ||
 	    dst_start + len > dst_vma->vm_end)
 		goto out_unlock;
