Skip to content

Commit f619147

Browse files
CmdrMoozy authored and torvalds committed
userfaultfd: add UFFDIO_CONTINUE ioctl
This ioctl is how userspace ought to resolve "minor" userfaults. The idea is, userspace is notified that a minor fault has occurred. It might change the contents of the page using its second non-UFFD mapping, or not. Then, it calls UFFDIO_CONTINUE to tell the kernel "I have ensured the page contents are correct, carry on setting up the mapping". Note that it doesn't make much sense to use UFFDIO_{COPY,ZEROPAGE} for MINOR registered VMAs. ZEROPAGE maps the VMA to the zero page; but in the minor fault case, we already have some pre-existing underlying page. Likewise, UFFDIO_COPY isn't useful if we have a second non-UFFD mapping. We'd just use memcpy() or similar instead. It turns out hugetlb_mcopy_atomic_pte() already does very close to what we want, if an existing page is provided via `struct page **pagep`. We already special-case the behavior a bit for the UFFDIO_ZEROPAGE case, so just extend that design: add an enum for the three modes of operation, and make the small adjustments needed for the MCOPY_ATOMIC_CONTINUE case. (Basically, look up the existing page, and avoid adding the existing page to the page cache or calling set_page_huge_active() on it.) Link: https://lkml.kernel.org/r/20210301222728.176417-5-axelrasmussen@google.com Signed-off-by: Axel Rasmussen <axelrasmussen@google.com> Reviewed-by: Peter Xu <peterx@redhat.com> Cc: Adam Ruprecht <ruprecht@google.com> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Anshuman Khandual <anshuman.khandual@arm.com> Cc: Cannon Matthews <cannonmatthews@google.com> Cc: Catalin Marinas <catalin.marinas@arm.com> Cc: Chinwen Chang <chinwen.chang@mediatek.com> Cc: David Rientjes <rientjes@google.com> Cc: "Dr . David Alan Gilbert" <dgilbert@redhat.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Ingo Molnar <mingo@redhat.com> Cc: Jann Horn <jannh@google.com> Cc: Jerome Glisse <jglisse@redhat.com> Cc: Kirill A. 
Shutemov <kirill@shutemov.name> Cc: Lokesh Gidra <lokeshgidra@google.com> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: Michael Ellerman <mpe@ellerman.id.au> Cc: "Michal Koutný" <mkoutny@suse.com> Cc: Michel Lespinasse <walken@google.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Mike Rapoport <rppt@linux.vnet.ibm.com> Cc: Mina Almasry <almasrymina@google.com> Cc: Nicholas Piggin <npiggin@gmail.com> Cc: Oliver Upton <oupton@google.com> Cc: Shaohua Li <shli@fb.com> Cc: Shawn Anastasio <shawn@anastas.io> Cc: Steven Price <steven.price@arm.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 714c189 commit f619147

File tree

6 files changed

+156
-30
lines changed

6 files changed

+156
-30
lines changed

fs/userfaultfd.c

+67
Original file line numberDiff line numberDiff line change
@@ -1487,6 +1487,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
14871487
if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_WP))
14881488
ioctls_out &= ~((__u64)1 << _UFFDIO_WRITEPROTECT);
14891489

1490+
/* CONTINUE ioctl is only supported for MINOR ranges. */
1491+
if (!(uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR))
1492+
ioctls_out &= ~((__u64)1 << _UFFDIO_CONTINUE);
1493+
14901494
/*
14911495
* Now that we scanned all vmas we can already tell
14921496
* userland which ioctls methods are guaranteed to
@@ -1840,6 +1844,66 @@ static int userfaultfd_writeprotect(struct userfaultfd_ctx *ctx,
18401844
return ret;
18411845
}
18421846

1847+
/*
 * userfaultfd_continue - handle the UFFDIO_CONTINUE ioctl.
 * @ctx: userfaultfd context the ioctl was issued on
 * @arg: userspace pointer to a struct uffdio_continue
 *
 * Copies in the request (everything except the trailing 'mapped' output
 * field), validates the range and mode bits, and then calls
 * mcopy_continue() on ctx->mm for the requested range.  The value returned
 * by mcopy_continue() is written back to the user's 'mapped' field.  Unless
 * UFFDIO_CONTINUE_MODE_DONTWAKE is set, wake_userfault() is called for the
 * part of the range that was processed.
 *
 * Return: 0 if the whole range was processed; -EAGAIN if ctx->mmap_changing
 * was set or only part of the range was processed; -EFAULT if the user
 * struct could not be read or 'mapped' could not be written; -EINVAL on
 * address wraparound or unknown mode bits; -ESRCH if mmget_not_zero()
 * fails; otherwise the error from validate_range() or mcopy_continue().
 */
static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg)
1848+
{
1849+
__s64 ret;
1850+
struct uffdio_continue uffdio_continue;
1851+
struct uffdio_continue __user *user_uffdio_continue;
1852+
struct userfaultfd_wake_range range;
1853+
1854+
user_uffdio_continue = (struct uffdio_continue __user *)arg;
1855+
1856+
/* Refuse to start while ctx->mmap_changing is set. */
ret = -EAGAIN;
1857+
if (READ_ONCE(ctx->mmap_changing))
1858+
goto out;
1859+
1860+
ret = -EFAULT;
1861+
if (copy_from_user(&uffdio_continue, user_uffdio_continue,
1862+
/* don't copy the output fields */
1863+
sizeof(uffdio_continue) - (sizeof(__s64))))
1864+
goto out;
1865+
1866+
ret = validate_range(ctx->mm, &uffdio_continue.range.start,
1867+
uffdio_continue.range.len);
1868+
if (ret)
1869+
goto out;
1870+
1871+
ret = -EINVAL;
1872+
/* double check for wraparound just in case. */
1873+
if (uffdio_continue.range.start + uffdio_continue.range.len <=
1874+
uffdio_continue.range.start) {
1875+
goto out;
1876+
}
1877+
/* DONTWAKE is the only mode bit accepted; anything else is -EINVAL. */
if (uffdio_continue.mode & ~UFFDIO_CONTINUE_MODE_DONTWAKE)
1878+
goto out;
1879+
1880+
/* Hold an mm reference (mmget/mmput) across mcopy_continue(). */
if (mmget_not_zero(ctx->mm)) {
1881+
ret = mcopy_continue(ctx->mm, uffdio_continue.range.start,
1882+
uffdio_continue.range.len,
1883+
&ctx->mmap_changing);
1884+
mmput(ctx->mm);
1885+
} else {
1886+
return -ESRCH;
1887+
}
1888+
1889+
/*
 * Report mcopy_continue()'s result (a length, or a negative error) to
 * userspace through the 'mapped' output field, even on failure.
 */
if (unlikely(put_user(ret, &user_uffdio_continue->mapped)))
1890+
return -EFAULT;
1891+
if (ret < 0)
1892+
goto out;
1893+
1894+
/* len == 0 would wake all */
1895+
BUG_ON(!ret);
1896+
range.len = ret;
1897+
if (!(uffdio_continue.mode & UFFDIO_CONTINUE_MODE_DONTWAKE)) {
1898+
range.start = uffdio_continue.range.start;
1899+
wake_userfault(ctx, &range);
1900+
}
1901+
/* A result shorter than the requested length is reported as -EAGAIN. */
ret = range.len == uffdio_continue.range.len ? 0 : -EAGAIN;
1902+
1903+
out:
1904+
return ret;
1905+
}
1906+
18431907
static inline unsigned int uffd_ctx_features(__u64 user_features)
18441908
{
18451909
/*
@@ -1927,6 +1991,9 @@ static long userfaultfd_ioctl(struct file *file, unsigned cmd,
19271991
case UFFDIO_WRITEPROTECT:
19281992
ret = userfaultfd_writeprotect(ctx, arg);
19291993
break;
1994+
case UFFDIO_CONTINUE:
1995+
ret = userfaultfd_continue(ctx, arg);
1996+
break;
19301997
}
19311998
return ret;
19321999
}

include/linux/hugetlb.h

+3
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
#include <linux/kref.h>
1212
#include <linux/pgtable.h>
1313
#include <linux/gfp.h>
14+
#include <linux/userfaultfd_k.h>
1415

1516
struct ctl_table;
1617
struct user_struct;
@@ -139,6 +140,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, pte_t *dst_pte,
139140
struct vm_area_struct *dst_vma,
140141
unsigned long dst_addr,
141142
unsigned long src_addr,
143+
enum mcopy_atomic_mode mode,
142144
struct page **pagep);
143145
#endif /* CONFIG_USERFAULTFD */
144146
bool hugetlb_reserve_pages(struct inode *inode, long from, long to,
@@ -318,6 +320,7 @@ static inline int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
318320
struct vm_area_struct *dst_vma,
319321
unsigned long dst_addr,
320322
unsigned long src_addr,
323+
enum mcopy_atomic_mode mode,
321324
struct page **pagep)
322325
{
323326
BUG();

include/linux/userfaultfd_k.h

+18
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,31 @@ extern int sysctl_unprivileged_userfaultfd;
3737

3838
extern vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason);
3939

40+
/*
41+
* The mode of operation for __mcopy_atomic and its helpers.
42+
*
43+
* This is almost an implementation detail (mcopy_atomic below doesn't take this
44+
* as a parameter), but it's exposed here because memory-kind-specific
45+
* implementations (e.g. hugetlbfs) need to know the mode of operation.
46+
*/
47+
enum mcopy_atomic_mode {
48+
/*
 * A normal copy_from_user into the destination range.
 * Passed by mcopy_atomic().
 */
49+
MCOPY_ATOMIC_NORMAL,
50+
/*
 * Don't copy; map the destination range to the zero page.
 * Passed by mfill_zeropage(); __mcopy_atomic_hugetlb() rejects this
 * mode with -EINVAL.
 */
51+
MCOPY_ATOMIC_ZEROPAGE,
52+
/*
 * Just install pte(s) with the existing page(s) in the page cache.
 * Passed by mcopy_continue(); __mcopy_atomic() only forwards this mode
 * to the hugetlb path and bails out for other VMA types.
 */
53+
MCOPY_ATOMIC_CONTINUE,
54+
};
55+
4056
extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
4157
unsigned long src_start, unsigned long len,
4258
bool *mmap_changing, __u64 mode);
4359
extern ssize_t mfill_zeropage(struct mm_struct *dst_mm,
4460
unsigned long dst_start,
4561
unsigned long len,
4662
bool *mmap_changing);
63+
extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start,
64+
unsigned long len, bool *mmap_changing);
4765
extern int mwriteprotect_range(struct mm_struct *dst_mm,
4866
unsigned long start, unsigned long len,
4967
bool enable_wp, bool *mmap_changing);

include/uapi/linux/userfaultfd.h

+19-2
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,12 @@
4040
((__u64)1 << _UFFDIO_WAKE | \
4141
(__u64)1 << _UFFDIO_COPY | \
4242
(__u64)1 << _UFFDIO_ZEROPAGE | \
43-
(__u64)1 << _UFFDIO_WRITEPROTECT)
43+
(__u64)1 << _UFFDIO_WRITEPROTECT | \
44+
(__u64)1 << _UFFDIO_CONTINUE)
4445
#define UFFD_API_RANGE_IOCTLS_BASIC \
4546
((__u64)1 << _UFFDIO_WAKE | \
46-
(__u64)1 << _UFFDIO_COPY)
47+
(__u64)1 << _UFFDIO_COPY | \
48+
(__u64)1 << _UFFDIO_CONTINUE)
4749

4850
/*
4951
* Valid ioctl command number range with this API is from 0x00 to
@@ -59,6 +61,7 @@
5961
#define _UFFDIO_COPY (0x03)
6062
#define _UFFDIO_ZEROPAGE (0x04)
6163
#define _UFFDIO_WRITEPROTECT (0x06)
64+
#define _UFFDIO_CONTINUE (0x07)
6265
#define _UFFDIO_API (0x3F)
6366

6467
/* userfaultfd ioctl ids */
@@ -77,6 +80,8 @@
7780
struct uffdio_zeropage)
7881
#define UFFDIO_WRITEPROTECT _IOWR(UFFDIO, _UFFDIO_WRITEPROTECT, \
7982
struct uffdio_writeprotect)
83+
/*
 * _IOWR, not _IOR: the handler reads the request (range, mode) from
 * userspace with copy_from_user() and writes the result back into the
 * 'mapped' field with put_user(), so the ioctl is bidirectional, like
 * UFFDIO_WRITEPROTECT above.
 */
#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, \
84+
struct uffdio_continue)
8085

8186
/* read() structure */
8287
struct uffd_msg {
@@ -268,6 +273,18 @@ struct uffdio_writeprotect {
268273
__u64 mode;
269274
};
270275

276+
/*
 * Argument of the UFFDIO_CONTINUE ioctl.  Userspace fills in 'range' and
 * 'mode'; the kernel writes its result to 'mapped'.
 */
struct uffdio_continue {
277+
/* Address range the ioctl operates on. */
struct uffdio_range range;
278+
/* When set in 'mode', the ioctl does not wake waiters on the range. */
#define UFFDIO_CONTINUE_MODE_DONTWAKE ((__u64)1<<0)
279+
/* Mode bits; only UFFDIO_CONTINUE_MODE_DONTWAKE is accepted. */
__u64 mode;
280+
281+
/*
282+
* Fields below here are written by the ioctl and must be at the end:
283+
* the copy_from_user will not read past here.
284+
*/
285+
/* Output: the length processed, or a negative error code. */
__s64 mapped;
286+
};
287+
271288
/*
272289
* Flags for the userfaultfd(2) system call itself.
273290
*/

mm/hugetlb.c

+26-14
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,6 @@
3939
#include <linux/hugetlb.h>
4040
#include <linux/hugetlb_cgroup.h>
4141
#include <linux/node.h>
42-
#include <linux/userfaultfd_k.h>
4342
#include <linux/page_owner.h>
4443
#include "internal.h"
4544

@@ -4865,8 +4864,10 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
48654864
struct vm_area_struct *dst_vma,
48664865
unsigned long dst_addr,
48674866
unsigned long src_addr,
4867+
enum mcopy_atomic_mode mode,
48684868
struct page **pagep)
48694869
{
4870+
bool is_continue = (mode == MCOPY_ATOMIC_CONTINUE);
48704871
struct address_space *mapping;
48714872
pgoff_t idx;
48724873
unsigned long size;
@@ -4876,8 +4877,17 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
48764877
spinlock_t *ptl;
48774878
int ret;
48784879
struct page *page;
4880+
int writable;
48794881

4880-
if (!*pagep) {
4882+
mapping = dst_vma->vm_file->f_mapping;
4883+
idx = vma_hugecache_offset(h, dst_vma, dst_addr);
4884+
4885+
if (is_continue) {
4886+
ret = -EFAULT;
4887+
page = find_lock_page(mapping, idx);
4888+
if (!page)
4889+
goto out;
4890+
} else if (!*pagep) {
48814891
ret = -ENOMEM;
48824892
page = alloc_huge_page(dst_vma, dst_addr, 0);
48834893
if (IS_ERR(page))
@@ -4906,13 +4916,8 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
49064916
*/
49074917
__SetPageUptodate(page);
49084918

4909-
mapping = dst_vma->vm_file->f_mapping;
4910-
idx = vma_hugecache_offset(h, dst_vma, dst_addr);
4911-
4912-
/*
4913-
* If shared, add to page cache
4914-
*/
4915-
if (vm_shared) {
4919+
/* Add shared, newly allocated pages to the page cache. */
4920+
if (vm_shared && !is_continue) {
49164921
size = i_size_read(mapping->host) >> huge_page_shift(h);
49174922
ret = -EFAULT;
49184923
if (idx >= size)
@@ -4957,8 +4962,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
49574962
hugepage_add_new_anon_rmap(page, dst_vma, dst_addr);
49584963
}
49594964

4960-
_dst_pte = make_huge_pte(dst_vma, page, dst_vma->vm_flags & VM_WRITE);
4961-
if (dst_vma->vm_flags & VM_WRITE)
4965+
/* For CONTINUE on a non-shared VMA, don't set VM_WRITE for CoW. */
4966+
if (is_continue && !vm_shared)
4967+
writable = 0;
4968+
else
4969+
writable = dst_vma->vm_flags & VM_WRITE;
4970+
4971+
_dst_pte = make_huge_pte(dst_vma, page, writable);
4972+
if (writable)
49624973
_dst_pte = huge_pte_mkdirty(_dst_pte);
49634974
_dst_pte = pte_mkyoung(_dst_pte);
49644975

@@ -4972,15 +4983,16 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
49724983
update_mmu_cache(dst_vma, dst_addr, dst_pte);
49734984

49744985
spin_unlock(ptl);
4975-
SetHPageMigratable(page);
4976-
if (vm_shared)
4986+
if (!is_continue)
4987+
SetHPageMigratable(page);
4988+
if (vm_shared || is_continue)
49774989
unlock_page(page);
49784990
ret = 0;
49794991
out:
49804992
return ret;
49814993
out_release_unlock:
49824994
spin_unlock(ptl);
4983-
if (vm_shared)
4995+
if (vm_shared || is_continue)
49844996
unlock_page(page);
49854997
out_release_nounlock:
49864998
put_page(page);

mm/userfaultfd.c

+23-14
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
207207
unsigned long dst_start,
208208
unsigned long src_start,
209209
unsigned long len,
210-
bool zeropage)
210+
enum mcopy_atomic_mode mode)
211211
{
212212
int vm_alloc_shared = dst_vma->vm_flags & VM_SHARED;
213213
int vm_shared = dst_vma->vm_flags & VM_SHARED;
@@ -227,7 +227,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
227227
* by THP. Since we can not reliably insert a zero page, this
228228
* feature is not supported.
229229
*/
230-
if (zeropage) {
230+
if (mode == MCOPY_ATOMIC_ZEROPAGE) {
231231
mmap_read_unlock(dst_mm);
232232
return -EINVAL;
233233
}
@@ -273,8 +273,6 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
273273
}
274274

275275
while (src_addr < src_start + len) {
276-
pte_t dst_pteval;
277-
278276
BUG_ON(dst_addr >= dst_start + len);
279277

280278
/*
@@ -297,16 +295,16 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
297295
goto out_unlock;
298296
}
299297

300-
err = -EEXIST;
301-
dst_pteval = huge_ptep_get(dst_pte);
302-
if (!huge_pte_none(dst_pteval)) {
298+
if (mode != MCOPY_ATOMIC_CONTINUE &&
299+
!huge_pte_none(huge_ptep_get(dst_pte))) {
300+
err = -EEXIST;
303301
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
304302
i_mmap_unlock_read(mapping);
305303
goto out_unlock;
306304
}
307305

308306
err = hugetlb_mcopy_atomic_pte(dst_mm, dst_pte, dst_vma,
309-
dst_addr, src_addr, &page);
307+
dst_addr, src_addr, mode, &page);
310308

311309
mutex_unlock(&hugetlb_fault_mutex_table[hash]);
312310
i_mmap_unlock_read(mapping);
@@ -408,7 +406,7 @@ extern ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
408406
unsigned long dst_start,
409407
unsigned long src_start,
410408
unsigned long len,
411-
bool zeropage);
409+
enum mcopy_atomic_mode mode);
412410
#endif /* CONFIG_HUGETLB_PAGE */
413411

414412
static __always_inline ssize_t mfill_atomic_pte(struct mm_struct *dst_mm,
@@ -458,7 +456,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
458456
unsigned long dst_start,
459457
unsigned long src_start,
460458
unsigned long len,
461-
bool zeropage,
459+
enum mcopy_atomic_mode mcopy_mode,
462460
bool *mmap_changing,
463461
__u64 mode)
464462
{
@@ -469,6 +467,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
469467
long copied;
470468
struct page *page;
471469
bool wp_copy;
470+
bool zeropage = (mcopy_mode == MCOPY_ATOMIC_ZEROPAGE);
472471

473472
/*
474473
* Sanitize the command parameters:
@@ -527,10 +526,12 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
527526
*/
528527
if (is_vm_hugetlb_page(dst_vma))
529528
return __mcopy_atomic_hugetlb(dst_mm, dst_vma, dst_start,
530-
src_start, len, zeropage);
529+
src_start, len, mcopy_mode);
531530

532531
if (!vma_is_anonymous(dst_vma) && !vma_is_shmem(dst_vma))
533532
goto out_unlock;
533+
if (mcopy_mode == MCOPY_ATOMIC_CONTINUE)
534+
goto out_unlock;
534535

535536
/*
536537
* Ensure the dst_vma has a anon_vma or this page
@@ -626,14 +627,22 @@ ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start,
626627
unsigned long src_start, unsigned long len,
627628
bool *mmap_changing, __u64 mode)
628629
{
629-
return __mcopy_atomic(dst_mm, dst_start, src_start, len, false,
630-
mmap_changing, mode);
630+
return __mcopy_atomic(dst_mm, dst_start, src_start, len,
631+
MCOPY_ATOMIC_NORMAL, mmap_changing, mode);
631632
}
632633

633634
ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start,
634635
unsigned long len, bool *mmap_changing)
635636
{
636-
return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing, 0);
637+
return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_ZEROPAGE,
638+
mmap_changing, 0);
639+
}
640+
641+
/*
 * mcopy_continue - MCOPY_ATOMIC_CONTINUE entry point of __mcopy_atomic().
 * @dst_mm: mm to operate on
 * @start: start address of the destination range
 * @len: length of the destination range
 * @mmap_changing: forwarded unchanged to __mcopy_atomic()
 *
 * Thin wrapper: passes 0 for both the source address and the mode flags,
 * and returns whatever __mcopy_atomic() returns.
 */
ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start,
642+
unsigned long len, bool *mmap_changing)
643+
{
644+
return __mcopy_atomic(dst_mm, start, 0, len, MCOPY_ATOMIC_CONTINUE,
645+
mmap_changing, 0);
637646
}
638647

639648
int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,

0 commit comments

Comments
 (0)