
Commit 64fe24a

davidhildenbrand authored and akpm00 committed
mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection
Similar to our MM_CP_DIRTY_ACCT handling for shared, writable mappings, we
can try mapping anonymous pages in a private writable mapping writable if
they are exclusive, the PTE is already dirty, and no special handling
applies.  Mapping the anonymous page writable is essentially the same
thing the write fault handler would do in this case.

Special handling is required for uffd-wp and softdirty tracking, so take
care of that properly.  Also, leave PROT_NONE handling alone for now; in
the future, we could similarly extend the logic in do_numa_page() or use
pte_mk_savedwrite() here.

While this improves mprotect(PROT_READ)+mprotect(PROT_READ|PROT_WRITE)
performance, it should also be a valuable optimization for uffd-wp, when
un-protecting.

This has been previously suggested by Peter Collingbourne in [1], relevant
in the context of the Scudo memory allocator, before we had
PageAnonExclusive.

This commit doesn't add the same handling for PMDs (i.e., anonymous THP,
anonymous hugetlb); benchmark results from Andrea indicate that there are
minor performance gains, so it might still be valuable to streamline that
logic for all anonymous pages in the future.

As we now also set MM_CP_DIRTY_ACCT for private mappings, let's rename it
to MM_CP_TRY_CHANGE_WRITABLE, to make it clearer what's actually
happening.

Micro-benchmark courtesy of Andrea:

===
#define _GNU_SOURCE
#include <sys/mman.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>

#define SIZE (1024*1024*1024)

int main(int argc, char *argv[])
{
	char *p;

	if (posix_memalign((void **)&p, sysconf(_SC_PAGESIZE)*512, SIZE))
		perror("posix_memalign"), exit(1);
	if (madvise(p, SIZE, argc > 1 ? MADV_HUGEPAGE : MADV_NOHUGEPAGE))
		perror("madvise");
	explicit_bzero(p, SIZE);
	for (int loops = 0; loops < 40; loops++) {
		if (mprotect(p, SIZE, PROT_READ))
			perror("mprotect"), exit(1);
		if (mprotect(p, SIZE, PROT_READ|PROT_WRITE))
			perror("mprotect"), exit(1);
		explicit_bzero(p, SIZE);
	}
}
===

Results on my Ryzen 9 3900X:

Stock 10 runs (lower is better):   AVG 6.398s, STDEV 0.043
Patched 10 runs (lower is better): AVG 3.780s, STDEV 0.026

===

[1] https://lkml.kernel.org/r/20210429214801.2583336-1-pcc@google.com

Link: https://lkml.kernel.org/r/20220614093629.76309-1-david@redhat.com
Signed-off-by: David Hildenbrand <david@redhat.com>
Suggested-by: Peter Collingbourne <pcc@google.com>
Acked-by: Peter Xu <peterx@redhat.com>
Cc: Nadav Amit <nadav.amit@gmail.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Yang Shi <shy828301@gmail.com>
Cc: Hugh Dickins <hughd@google.com>
Cc: Mel Gorman <mgorman@techsingularity.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
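To make the uffd-wp point concrete, here is a minimal userspace sketch
(for illustration only, not part of this commit) of the write-protect/
un-protect pattern the optimization targets: after un-protecting, dirty
exclusive anonymous PTEs can be mapped writable directly instead of
taking a write fault on the next store. Whether a given kernel applies
the optimization on this path depends on the cp_flags the uffd code
passes to change_protection(). The sketch assumes anonymous uffd-wp
support and permission to call userfaultfd(2):

===
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/userfaultfd.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <unistd.h>

#define SIZE (128UL*4096)

int main(void)
{
	int uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
	struct uffdio_api api = {
		.api = UFFD_API,
		.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP,
	};
	char *p;

	if (uffd < 0)
		perror("userfaultfd"), exit(1);
	if (ioctl(uffd, UFFDIO_API, &api))
		perror("UFFDIO_API"), exit(1);

	p = mmap(NULL, SIZE, PROT_READ|PROT_WRITE,
		 MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
	if (p == MAP_FAILED)
		perror("mmap"), exit(1);
	/* Populate: the anonymous PTEs become dirty and exclusive. */
	memset(p, 0xff, SIZE);

	struct uffdio_register reg = {
		.range = { .start = (unsigned long)p, .len = SIZE },
		.mode = UFFDIO_REGISTER_MODE_WP,
	};
	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
		perror("UFFDIO_REGISTER"), exit(1);

	/* Write-protect the whole range... */
	struct uffdio_writeprotect wp = {
		.range = { .start = (unsigned long)p, .len = SIZE },
		.mode = UFFDIO_WRITEPROTECT_MODE_WP,
	};
	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp))
		perror("UFFDIO_WRITEPROTECT"), exit(1);

	/*
	 * ...and un-protect it again. This is the spot where the
	 * optimization can map dirty, exclusive anonymous PTEs writable
	 * directly, so the stores below would need no write faults.
	 */
	wp.mode = 0;
	if (ioctl(uffd, UFFDIO_WRITEPROTECT, &wp))
		perror("UFFDIO_WRITEPROTECT"), exit(1);

	memset(p, 0xaa, SIZE);
	return 0;
}
===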
1 parent 50b0f79 commit 64fe24a

File tree: 2 files changed (+68, -17 lines)

include/linux/mm.h

Lines changed: 6 additions & 2 deletions
@@ -1962,8 +1962,12 @@ extern unsigned long move_page_tables(struct vm_area_struct *vma,
  * for now all the callers are only use one of the flags at the same
  * time.
  */
-/* Whether we should allow dirty bit accounting */
-#define  MM_CP_DIRTY_ACCT		(1UL << 0)
+/*
+ * Whether we should manually check if we can map individual PTEs writable,
+ * because something (e.g., COW, uffd-wp) blocks that from happening for all
+ * PTEs automatically in a writable mapping.
+ */
+#define  MM_CP_TRY_CHANGE_WRITABLE	(1UL << 0)
 /* Whether this protection change is for NUMA hints */
 #define  MM_CP_PROT_NUMA		(1UL << 1)
 /* Whether this change is for write protecting */

mm/mprotect.c

Lines changed: 62 additions & 15 deletions
@@ -38,6 +38,39 @@
 
 #include "internal.h"
 
+static inline bool can_change_pte_writable(struct vm_area_struct *vma,
+					   unsigned long addr, pte_t pte)
+{
+	struct page *page;
+
+	VM_BUG_ON(!(vma->vm_flags & VM_WRITE) || pte_write(pte));
+
+	if (pte_protnone(pte) || !pte_dirty(pte))
+		return false;
+
+	/* Do we need write faults for softdirty tracking? */
+	if ((vma->vm_flags & VM_SOFTDIRTY) && !pte_soft_dirty(pte))
+		return false;
+
+	/* Do we need write faults for uffd-wp tracking? */
+	if (userfaultfd_pte_wp(vma, pte))
+		return false;
+
+	if (!(vma->vm_flags & VM_SHARED)) {
+		/*
+		 * We can only special-case on exclusive anonymous pages,
+		 * because we know that our write-fault handler similarly would
+		 * map them writable without any additional checks while holding
+		 * the PT lock.
+		 */
+		page = vm_normal_page(vma, addr, pte);
+		if (!page || !PageAnon(page) || !PageAnonExclusive(page))
+			return false;
+	}
+
+	return true;
+}
+
 static unsigned long change_pte_range(struct mmu_gather *tlb,
 		struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr,
 		unsigned long end, pgprot_t newprot, unsigned long cp_flags)
@@ -46,7 +79,6 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
 	spinlock_t *ptl;
 	unsigned long pages = 0;
 	int target_node = NUMA_NO_NODE;
-	bool dirty_accountable = cp_flags & MM_CP_DIRTY_ACCT;
 	bool prot_numa = cp_flags & MM_CP_PROT_NUMA;
 	bool uffd_wp = cp_flags & MM_CP_UFFD_WP;
 	bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
@@ -137,21 +169,27 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
 				ptent = pte_wrprotect(ptent);
 				ptent = pte_mkuffd_wp(ptent);
 			} else if (uffd_wp_resolve) {
-				/*
-				 * Leave the write bit to be handled
-				 * by PF interrupt handler, then
-				 * things like COW could be properly
-				 * handled.
-				 */
 				ptent = pte_clear_uffd_wp(ptent);
 			}
 
-			/* Avoid taking write faults for known dirty pages */
-			if (dirty_accountable && pte_dirty(ptent) &&
-					(pte_soft_dirty(ptent) ||
-					 !(vma->vm_flags & VM_SOFTDIRTY))) {
+			/*
+			 * In some writable, shared mappings, we might want
+			 * to catch actual write access -- see
+			 * vma_wants_writenotify().
+			 *
+			 * In all writable, private mappings, we have to
+			 * properly handle COW.
+			 *
+			 * In both cases, we can sometimes still change PTEs
+			 * writable and avoid the write-fault handler, for
+			 * example, if a PTE is already dirty and no other
+			 * COW or special handling is required.
+			 */
+			if ((cp_flags & MM_CP_TRY_CHANGE_WRITABLE) &&
+			    !pte_write(ptent) &&
+			    can_change_pte_writable(vma, addr, ptent))
 				ptent = pte_mkwrite(ptent);
-			}
 
 			ptep_modify_prot_commit(vma, addr, pte, oldpte, ptent);
 			if (pte_needs_flush(oldpte, ptent))
 				tlb_flush_pte_range(tlb, addr, PAGE_SIZE);
@@ -505,9 +543,9 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	unsigned long oldflags = vma->vm_flags;
 	long nrpages = (end - start) >> PAGE_SHIFT;
 	unsigned long charged = 0;
+	bool try_change_writable;
 	pgoff_t pgoff;
 	int error;
-	int dirty_accountable = 0;
 
 	if (newflags == oldflags) {
 		*pprev = vma;
@@ -583,11 +621,20 @@ mprotect_fixup(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	 * held in write mode.
 	 */
 	vma->vm_flags = newflags;
-	dirty_accountable = vma_wants_writenotify(vma, vma->vm_page_prot);
+	/*
+	 * We want to check manually if we can change individual PTEs writable
+	 * if we can't do that automatically for all PTEs in a mapping. For
+	 * private mappings, that's always the case when we have write
+	 * permissions as we properly have to handle COW.
+	 */
+	if (vma->vm_flags & VM_SHARED)
+		try_change_writable = vma_wants_writenotify(vma, vma->vm_page_prot);
+	else
+		try_change_writable = !!(vma->vm_flags & VM_WRITE);
 	vma_set_page_prot(vma);
 
 	change_protection(tlb, vma, start, end, vma->vm_page_prot,
-			  dirty_accountable ? MM_CP_DIRTY_ACCT : 0);
+			  try_change_writable ? MM_CP_TRY_CHANGE_WRITABLE : 0);
 
 	/*
 	 * Private VM_LOCKED VMA becoming writable: trigger COW to avoid major
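Since mprotect_fixup() now passes MM_CP_TRY_CHANGE_WRITABLE for all
writable private mappings, a rough way to observe the effect from
userspace is to count minor faults around an mprotect() cycle. The
sketch below (for illustration, not part of the commit) should report
roughly one fault per 4 KiB page on a stock kernel and close to zero on
a patched one; MADV_NOHUGEPAGE keeps THP out of the picture, since the
PMD path is intentionally left alone by this commit:

===
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/resource.h>

#define SIZE (256UL*1024*1024)

static long minflt(void)
{
	struct rusage ru;

	getrusage(RUSAGE_SELF, &ru);
	return ru.ru_minflt;
}

int main(void)
{
	char *p = mmap(NULL, SIZE, PROT_READ|PROT_WRITE,
		       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		perror("mmap"), exit(1);
	if (madvise(p, SIZE, MADV_NOHUGEPAGE))
		perror("madvise");
	/* Populate: the anonymous PTEs become dirty and exclusive. */
	memset(p, 1, SIZE);

	long before = minflt();
	if (mprotect(p, SIZE, PROT_READ) ||
	    mprotect(p, SIZE, PROT_READ|PROT_WRITE))
		perror("mprotect"), exit(1);
	/* ~0 write faults on a patched kernel, ~1 per page on stock. */
	memset(p, 2, SIZE);
	printf("minor faults across cycle: %ld\n", minflt() - before);
	return 0;
}
===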
