@@ -773,15 +773,142 @@ copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	return 0;
 }
 
-static inline void
+/*
+ * Copy a present and normal page if necessary.
+ *
+ * NOTE! The usual case is that this doesn't need to do
+ * anything, and can just return a positive value. That
+ * will let the caller know that it can just increase
+ * the page refcount and re-use the pte the traditional
+ * way.
+ *
+ * But _if_ we need to copy it because it needs to be
+ * pinned in the parent (and the child should get its own
+ * copy rather than just a reference to the same page),
+ * we'll do that here and return zero to let the caller
+ * know we're done.
+ *
+ * And if we need a pre-allocated page but don't yet have
+ * one, return a negative error to let the preallocation
+ * code know so that it can do so outside the page table
+ * lock.
+ */
+static inline int
+copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+		pte_t *dst_pte, pte_t *src_pte,
+		struct vm_area_struct *vma, struct vm_area_struct *new,
+		unsigned long addr, int *rss, struct page **prealloc,
+		pte_t pte, struct page *page)
+{
+	struct page *new_page;
+
+	if (!is_cow_mapping(vma->vm_flags))
+		return 1;
+
+	/*
+	 * The trick starts.
+	 *
+	 * What we want to do is to check whether this page may
+	 * have been pinned by the parent process. If so,
+	 * instead of wrprotect the pte on both sides, we copy
+	 * the page immediately so that we'll always guarantee
+	 * the pinned page won't be randomly replaced in the
+	 * future.
+	 *
+	 * To achieve this, we do the following:
+	 *
+	 * 1. Write-protect the pte if it's writable. This is
+	 *    to protect concurrent write fast-gup with
+	 *    FOLL_PIN, so that we'll fail the fast-gup with
+	 *    the write bit removed.
+	 *
+	 * 2. Check page_maybe_dma_pinned() to see whether this
+	 *    page may have been pinned.
+	 *
+	 * The order of these steps is important to serialize
+	 * against the fast-gup code (gup_pte_range()) on the
+	 * pte check and try_grab_compound_head(), so that
+	 * we'll make sure either we'll capture that fast-gup
+	 * so we'll copy the pinned page here, or we'll fail
+	 * that fast-gup.
+	 *
+	 * NOTE! Even if we don't end up copying the page,
+	 * we won't undo this wrprotect(), because the normal
+	 * reference copy will need it anyway.
+	 */
+	if (pte_write(pte))
+		ptep_set_wrprotect(src_mm, addr, src_pte);
+
+	/*
+	 * These are the "normally we can just copy by reference"
+	 * checks.
+	 */
+	if (likely(!atomic_read(&src_mm->has_pinned)))
+		return 1;
+	if (likely(!page_maybe_dma_pinned(page)))
+		return 1;
+
+	/*
+	 * Uhhuh. It looks like the page might be a pinned page,
+	 * and we actually need to copy it. Now we can set the
+	 * source pte back to being writable.
+	 */
+	if (pte_write(pte))
+		set_pte_at(src_mm, addr, src_pte, pte);
+
+	new_page = *prealloc;
+	if (!new_page)
+		return -EAGAIN;
+
+	/*
+	 * We have a prealloc page, all good! Take it
+	 * over and copy the page & arm it.
+	 */
+	*prealloc = NULL;
+	copy_user_highpage(new_page, page, addr, vma);
+	__SetPageUptodate(new_page);
+	page_add_new_anon_rmap(new_page, new, addr, false);
+	lru_cache_add_inactive_or_unevictable(new_page, new);
+	rss[mm_counter(new_page)]++;
+
+	/* All done, just insert the new page copy in the child */
+	pte = mk_pte(new_page, new->vm_page_prot);
+	pte = maybe_mkwrite(pte_mkdirty(pte), new);
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
+}
+
+/*
+ * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
+ * is required to copy this pte.
+ */
+static inline int
 copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
-		unsigned long addr, int *rss)
+		struct vm_area_struct *new,
+		unsigned long addr, int *rss, struct page **prealloc)
 {
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
 
+	page = vm_normal_page(vma, addr, pte);
+	if (page) {
+		int retval;
+
+		retval = copy_present_page(dst_mm, src_mm,
+					   dst_pte, src_pte,
+					   vma, new,
+					   addr, rss, prealloc,
+					   pte, page);
+		if (retval <= 0)
+			return retval;
+
+		get_page(page);
+		page_dup_rmap(page, false);
+		rss[mm_counter(page)]++;
+	}
+
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
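
The comment block above lays out a three-way return contract for copy_present_page(). As a reading aid only, here is a small userspace model of that contract; it is not kernel code, and every name in it (model_copy, fake_page, and so on) is invented for illustration. A positive return means "share by reference as before", zero means "the page has already been copied for the child", and a negative value means "a preallocated page is needed, retry after allocating one".

#include <stdbool.h>
#include <stdio.h>

enum copy_result { NEED_PREALLOC = -1, COPIED = 0, SHARE = 1 };

struct fake_page { int data; bool maybe_pinned; };

/*
 * Mirrors only the decision flow of copy_present_page(): share unless the
 * mapping is private COW, the mm has ever pinned pages, and this page looks
 * pinned; in that case copy into the caller-provided spare page, or ask for
 * one if none is available yet.
 */
static enum copy_result model_copy(bool is_cow, bool mm_has_pinned,
				   const struct fake_page *page,
				   struct fake_page **prealloc)
{
	struct fake_page *new_page;

	if (!is_cow)
		return SHARE;
	/* (the real code write-protects the source pte before checking) */
	if (!mm_has_pinned)
		return SHARE;
	if (!page->maybe_pinned)
		return SHARE;

	new_page = *prealloc;
	if (!new_page)
		return NEED_PREALLOC;	/* caller allocates outside the lock */

	new_page->data = page->data;	/* stands in for copy_user_highpage() */
	new_page->maybe_pinned = false;
	*prealloc = NULL;		/* ownership moves to the child */
	return COPIED;
}

int main(void)
{
	struct fake_page parent = { .data = 42, .maybe_pinned = true };
	struct fake_page spare = { 0 };
	struct fake_page *prealloc = NULL;

	printf("first try: %d\n", model_copy(true, true, &parent, &prealloc));
	prealloc = &spare;
	printf("retry:     %d\n", model_copy(true, true, &parent, &prealloc));
	printf("child copy holds %d, prealloc consumed: %d\n",
	       spare.data, prealloc == NULL);
	return 0;
}
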
@@ -807,14 +934,27 @@ copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (!(vm_flags & VM_UFFD_WP))
 		pte = pte_clear_uffd_wp(pte);
 
-	page = vm_normal_page(vma, addr, pte);
-	if (page) {
-		get_page(page);
-		page_dup_rmap(page, false);
-		rss[mm_counter(page)]++;
+	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return 0;
+}
+
+static inline struct page *
+page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
+		   unsigned long addr)
+{
+	struct page *new_page;
+
+	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
+	if (!new_page)
+		return NULL;
+
+	if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+		put_page(new_page);
+		return NULL;
 	}
+	cgroup_throttle_swaprate(new_page, GFP_KERNEL);
 
-	set_pte_at(dst_mm, addr, dst_pte, pte);
+	return new_page;
 }
 
 static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -825,16 +965,20 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	pte_t *orig_src_pte, *orig_dst_pte;
 	pte_t *src_pte, *dst_pte;
 	spinlock_t *src_ptl, *dst_ptl;
-	int progress = 0;
+	int progress, ret = 0;
 	int rss[NR_MM_COUNTERS];
 	swp_entry_t entry = (swp_entry_t){0};
+	struct page *prealloc = NULL;
 
 again:
+	progress = 0;
 	init_rss_vec(rss);
 
 	dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
-	if (!dst_pte)
-		return -ENOMEM;
+	if (!dst_pte) {
+		ret = -ENOMEM;
+		goto out;
+	}
 	src_pte = pte_offset_map(src_pmd, addr);
 	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -866,8 +1010,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 			progress += 8;
 			continue;
 		}
-		copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
-				 vma, addr, rss);
+		/* copy_present_pte() will clear `*prealloc' if consumed */
+		ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
+				       vma, new, addr, rss, &prealloc);
+		/*
+		 * If we need a pre-allocated page for this pte, drop the
+		 * locks, allocate, and try again.
+		 */
+		if (unlikely(ret == -EAGAIN))
+			break;
+		if (unlikely(prealloc)) {
+			/*
+			 * pre-alloc page cannot be reused by next time so as
+			 * to strictly follow mempolicy (e.g., alloc_page_vma()
+			 * will allocate page according to address). This
+			 * could only happen if one pinned pte changed.
+			 */
+			put_page(prealloc);
+			prealloc = NULL;
+		}
 		progress += 8;
 	} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
 
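A side note on the `*prealloc' handling in the loop above: the callee signals "I used your spare page" purely by clearing the caller's pointer, and an unconsumed spare is dropped rather than cached, because alloc_page_vma() picked it to match the mempolicy of the address that needed it. The ownership convention itself is a generic double-pointer pattern; below is a hypothetical, non-kernel sketch of just that convention, with every name invented.

#include <stdlib.h>

struct spare_buf { char bytes[64]; };

/*
 * Consume-or-leave convention: when the spare is needed and present, the
 * callee takes ownership and clears the caller's pointer; otherwise the
 * caller still owns it and must decide what to do with it.
 */
static int use_spare_if_needed(struct spare_buf **spare, int need_copy)
{
	if (!need_copy)
		return 0;
	if (!*spare)
		return -1;	/* would be -EAGAIN in the kernel code */
	free(*spare);		/* stands in for handing the page to the child */
	*spare = NULL;
	return 0;
}

static void caller(int need_copy)
{
	struct spare_buf *spare = malloc(sizeof(*spare));

	if (use_spare_if_needed(&spare, need_copy) == 0 && spare) {
		/* not consumed: discard, mirroring the put_page() above */
		free(spare);
		spare = NULL;
	}
}

int main(void)
{
	caller(0);	/* spare never needed: discarded by the caller */
	caller(1);	/* spare consumed: callee cleared the pointer */
	return 0;
}
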
@@ -879,13 +1040,25 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	cond_resched();
 
 	if (entry.val) {
-		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+		if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
+			ret = -ENOMEM;
+			goto out;
+		}
+		entry.val = 0;
+	} else if (ret) {
+		WARN_ON_ONCE(ret != -EAGAIN);
+		prealloc = page_copy_prealloc(src_mm, vma, addr);
+		if (!prealloc)
 			return -ENOMEM;
-		progress = 0;
+		/* We've captured and resolved the error. Reset, try again. */
+		ret = 0;
 	}
 	if (addr != end)
 		goto again;
-	return 0;
+out:
+	if (unlikely(prealloc))
+		put_page(prealloc);
+	return ret;
 }
 
 static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
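
The reshaped copy_pte_range() above is the retry engine for the -EAGAIN case: break out of the pte loop, drop the page table locks, allocate the spare page with page_copy_prealloc(), and jump back to `again' to resume at the same address. The sketch below is a userspace-only model of that shape (a pthread mutex instead of the pte locks, malloc() instead of page_copy_prealloc(), and every name invented); it is not the kernel implementation.

#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

/* Pretend "copy one entry": every third entry needs a spare buffer. */
static int copy_one(int idx, void **spare)
{
	if (idx % 3 == 2) {
		if (!*spare)
			return -EAGAIN;	/* ask the caller to preallocate */
		free(*spare);		/* "consume" the spare buffer */
		*spare = NULL;
	}
	return 0;
}

static int copy_range(int nr)
{
	void *spare = NULL;
	int idx = 0, ret = 0;

again:
	pthread_mutex_lock(&table_lock);
	for (; idx < nr; idx++) {
		ret = copy_one(idx, &spare);
		if (ret == -EAGAIN)
			break;		/* must allocate outside the lock */
	}
	pthread_mutex_unlock(&table_lock);

	if (ret == -EAGAIN) {
		spare = malloc(64);	/* page_copy_prealloc() analogue */
		if (!spare)
			return -ENOMEM;
		ret = 0;
		goto again;		/* resume at the same idx */
	}
	if (spare)
		free(spare);		/* leftover spare, never consumed */
	return ret;
}

int main(void)
{
	printf("copy_range: %d\n", copy_range(10));
	return 0;
}
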