Commit 70e806e

xzpeter authored and torvalds committed
mm: Do early cow for pinned pages during fork() for ptes
This allows copy_pte_range() to do early cow if the pages were pinned on the source mm.

Currently we don't have an accurate way to know whether a page is pinned or not. The only thing we have is page_maybe_dma_pinned(). However that's good enough for now, especially with the newly added mm->has_pinned flag, which makes sure we won't affect processes that never pinned any pages.

It would be easier if we could do a GFP_KERNEL allocation within copy_one_pte(). Unfortunately we can't, because we hold the page table locks for both the parent and the child processes, so the page allocation needs to be done outside copy_one_pte().

There is a trick in copy_present_pte(), mainly the wrprotect trick used to block concurrent fast-gup; the comments in the function explain it in place.

Oleg Nesterov reported a (probably harmless) bug during review: we didn't reset entry.val properly in copy_pte_range(), so there was a chance of calling add_swap_count_continuation() multiple times on the same swp entry. That should be harmless, because even if it happens, the same function (add_swap_count_continuation()) will return directly after noticing that there is enough space for the swp counter. So instead of a standalone stable patch, it is fixed up in this patch directly.

Link: https://lore.kernel.org/lkml/20200914143829.GA1424636@nvidia.com/
Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Peter Xu <peterx@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent 7a4830c commit 70e806e
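For readers less familiar with the heuristic named in the message: page_maybe_dma_pinned() cannot distinguish a FOLL_PIN pin from a page that simply has many ordinary references. A paraphrased sketch of the check as it looked around this release (simplified, not the verbatim mm/ source) is:

static inline bool page_maybe_dma_pinned(struct page *page)
{
        /* THPs of this era track FOLL_PIN pins in a dedicated counter. */
        if (hpage_pincount_available(page))
                return compound_pincount(page) > 0;

        /*
         * Ordinary pages don't: each FOLL_PIN pin biases the refcount by
         * GUP_PIN_COUNTING_BIAS (1024), so a refcount at or above the bias
         * is treated as "maybe pinned".  Pages with very many normal
         * references can therefore give a false positive, which only costs
         * an unnecessary copy during fork().
         */
        return page_ref_count(page) >= GUP_PIN_COUNTING_BIAS;
}

The mm->has_pinned flag tested in the diff below is set once by the first FOLL_PIN pin on an mm and never cleared, which is what keeps fork() of never-pinning processes on the old fast path.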

mm/memory.c

Lines changed: 189 additions & 16 deletions
@@ -773,15 +773,142 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         return 0;
 }
 
-static inline void
+/*
+ * Copy a present and normal page if necessary.
+ *
+ * NOTE! The usual case is that this doesn't need to do
+ * anything, and can just return a positive value. That
+ * will let the caller know that it can just increase
+ * the page refcount and re-use the pte the traditional
+ * way.
+ *
+ * But _if_ we need to copy it because it needs to be
+ * pinned in the parent (and the child should get its own
+ * copy rather than just a reference to the same page),
+ * we'll do that here and return zero to let the caller
+ * know we're done.
+ *
+ * And if we need a pre-allocated page but don't yet have
+ * one, return a negative error to let the preallocation
+ * code know so that it can do so outside the page table
+ * lock.
+ */
+static inline int
+copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+                pte_t *dst_pte, pte_t *src_pte,
+                struct vm_area_struct *vma, struct vm_area_struct *new,
+                unsigned long addr, int *rss, struct page **prealloc,
+                pte_t pte, struct page *page)
+{
+        struct page *new_page;
+
+        if (!is_cow_mapping(vma->vm_flags))
+                return 1;
+
+        /*
+         * The trick starts.
+         *
+         * What we want to do is to check whether this page may
+         * have been pinned by the parent process.  If so,
+         * instead of wrprotect the pte on both sides, we copy
+         * the page immediately so that we'll always guarantee
+         * the pinned page won't be randomly replaced in the
+         * future.
+         *
+         * To achieve this, we do the following:
+         *
+         * 1. Write-protect the pte if it's writable.  This is
+         *    to protect concurrent write fast-gup with
+         *    FOLL_PIN, so that we'll fail the fast-gup with
+         *    the write bit removed.
+         *
+         * 2. Check page_maybe_dma_pinned() to see whether this
+         *    page may have been pinned.
+         *
+         * The order of these steps is important to serialize
+         * against the fast-gup code (gup_pte_range()) on the
+         * pte check and try_grab_compound_head(), so that
+         * we'll make sure either we'll capture that fast-gup
+         * so we'll copy the pinned page here, or we'll fail
+         * that fast-gup.
+         *
+         * NOTE! Even if we don't end up copying the page,
+         * we won't undo this wrprotect(), because the normal
+         * reference copy will need it anyway.
+         */
+        if (pte_write(pte))
+                ptep_set_wrprotect(src_mm, addr, src_pte);
+
+        /*
+         * These are the "normally we can just copy by reference"
+         * checks.
+         */
+        if (likely(!atomic_read(&src_mm->has_pinned)))
+                return 1;
+        if (likely(!page_maybe_dma_pinned(page)))
+                return 1;
+
+        /*
+         * Uhhuh. It looks like the page might be a pinned page,
+         * and we actually need to copy it. Now we can set the
+         * source pte back to being writable.
+         */
+        if (pte_write(pte))
+                set_pte_at(src_mm, addr, src_pte, pte);
+
+        new_page = *prealloc;
+        if (!new_page)
+                return -EAGAIN;
+
+        /*
+         * We have a prealloc page, all good!  Take it
+         * over and copy the page & arm it.
+         */
+        *prealloc = NULL;
+        copy_user_highpage(new_page, page, addr, vma);
+        __SetPageUptodate(new_page);
+        page_add_new_anon_rmap(new_page, new, addr, false);
+        lru_cache_add_inactive_or_unevictable(new_page, new);
+        rss[mm_counter(new_page)]++;
+
+        /* All done, just insert the new page copy in the child */
+        pte = mk_pte(new_page, new->vm_page_prot);
+        pte = maybe_mkwrite(pte_mkdirty(pte), new);
+        set_pte_at(dst_mm, addr, dst_pte, pte);
+        return 0;
+}
+
+/*
+ * Copy one pte.  Returns 0 if succeeded, or -EAGAIN if one preallocated page
+ * is required to copy this pte.
+ */
+static inline int
 copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-                unsigned long addr, int *rss)
+                struct vm_area_struct *new,
+                unsigned long addr, int *rss, struct page **prealloc)
 {
         unsigned long vm_flags = vma->vm_flags;
         pte_t pte = *src_pte;
         struct page *page;
 
+        page = vm_normal_page(vma, addr, pte);
+        if (page) {
+                int retval;
+
+                retval = copy_present_page(dst_mm, src_mm,
+                                           dst_pte, src_pte,
+                                           vma, new,
+                                           addr, rss, prealloc,
+                                           pte, page);
+                if (retval <= 0)
+                        return retval;
+
+                get_page(page);
+                page_dup_rmap(page, false);
+                rss[mm_counter(page)]++;
+        }
+
         /*
          * If it's a COW mapping, write protect it both
          * in the parent and the child
@@ -807,14 +934,27 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         if (!(vm_flags & VM_UFFD_WP))
                 pte = pte_clear_uffd_wp(pte);
 
-        page = vm_normal_page(vma, addr, pte);
-        if (page) {
-                get_page(page);
-                page_dup_rmap(page, false);
-                rss[mm_counter(page)]++;
+        set_pte_at(dst_mm, addr, dst_pte, pte);
+        return 0;
+}
+
+static inline struct page *
+page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
+                   unsigned long addr)
+{
+        struct page *new_page;
+
+        new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
+        if (!new_page)
+                return NULL;
+
+        if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+                put_page(new_page);
+                return NULL;
         }
+        cgroup_throttle_swaprate(new_page, GFP_KERNEL);
 
-        set_pte_at(dst_mm, addr, dst_pte, pte);
+        return new_page;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -825,16 +965,20 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         pte_t *orig_src_pte, *orig_dst_pte;
         pte_t *src_pte, *dst_pte;
         spinlock_t *src_ptl, *dst_ptl;
-        int progress = 0;
+        int progress, ret = 0;
         int rss[NR_MM_COUNTERS];
         swp_entry_t entry = (swp_entry_t){0};
+        struct page *prealloc = NULL;
 
 again:
+        progress = 0;
         init_rss_vec(rss);
 
         dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
-        if (!dst_pte)
-                return -ENOMEM;
+        if (!dst_pte) {
+                ret = -ENOMEM;
+                goto out;
+        }
         src_pte = pte_offset_map(src_pmd, addr);
         src_ptl = pte_lockptr(src_mm, src_pmd);
         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -866,8 +1010,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                         progress += 8;
                         continue;
                 }
-                copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
-                                 vma, addr, rss);
+                /* copy_present_pte() will clear `*prealloc' if consumed */
+                ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
+                                       vma, new, addr, rss, &prealloc);
+                /*
+                 * If we need a pre-allocated page for this pte, drop the
+                 * locks, allocate, and try again.
+                 */
+                if (unlikely(ret == -EAGAIN))
+                        break;
+                if (unlikely(prealloc)) {
+                        /*
+                         * pre-alloc page cannot be reused by next time so as
+                         * to strictly follow mempolicy (e.g., alloc_page_vma()
+                         * will allocate page according to address).  This
+                         * could only happen if one pinned pte changed.
+                         */
+                        put_page(prealloc);
+                        prealloc = NULL;
+                }
                 progress += 8;
         } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
@@ -879,13 +1040,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         cond_resched();
 
         if (entry.val) {
-                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+                if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
+                        ret = -ENOMEM;
+                        goto out;
+                }
+                entry.val = 0;
+        } else if (ret) {
+                WARN_ON_ONCE(ret != -EAGAIN);
+                prealloc = page_copy_prealloc(src_mm, vma, addr);
+                if (!prealloc)
                         return -ENOMEM;
-                progress = 0;
+                /* We've captured and resolved the error. Reset, try again. */
+                ret = 0;
         }
         if (addr != end)
                 goto again;
-        return 0;
+out:
+        if (unlikely(prealloc))
+                put_page(prealloc);
+        return ret;
 }
 
 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
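As context for the ordering comment in copy_present_page() above (wrprotect first, then the pin check): the lockless fast-gup reader takes its FOLL_PIN reference and only afterwards re-checks that the pte is still unchanged, so a racing fork() either sees the elevated refcount or forces the reader to back off. A heavily abridged sketch of that reader (not the real gup_pte_range(); the pte read and the reference drop are simplified, and the helper name is hypothetical):

/* Abridged sketch of the fast-gup pte walk; error handling omitted. */
static int gup_pte_range_sketch(pte_t *ptep, unsigned int flags,
                                struct page **pages, int *nr)
{
        pte_t pte = READ_ONCE(*ptep);   /* the real code uses an arch-aware pte read */
        struct page *head, *page;

        /* A pte that fork() already wrprotected fails a FOLL_WRITE fast-gup. */
        if (!pte_present(pte) || ((flags & FOLL_WRITE) && !pte_write(pte)))
                return 0;

        page = pte_page(pte);
        head = try_grab_compound_head(page, 1, flags);  /* take the pinned reference */
        if (!head)
                return 0;

        /* If fork() changed the pte after we read it, drop the pin and back off. */
        if (unlikely(pte_val(pte) != pte_val(READ_ONCE(*ptep)))) {
                put_page(head); /* real code undoes the grab with a FOLL_PIN-aware helper */
                return 0;
        }

        pages[(*nr)++] = page;
        return 1;
}

Because copy_present_page() write-protects the pte before calling page_maybe_dma_pinned(), any pin that survives the re-check above is already visible in the page refcount, and any pin that is not yet visible is rejected by the re-check, so the early copy cannot miss a concurrently established pin.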
