Skip to content

Commit

Permalink
mm, hugepages: add mremap() support for hugepage backed vma
Browse files Browse the repository at this point in the history
Support mremap() for hugepage backed vma segment by simply repositioning
page table entries.  The page table entries are repositioned to the new
virtual address on mremap().

Hugetlb mremap() support is of course generic; my motivating use case is
a library (hugepage_text), which reloads the ELF text of executables in
hugepages.  This significantly increases the execution performance of
said executables.

Restrict the mremap operation on hugepages to up to the size of the
original mapping as the underlying hugetlb reservation is not yet
capable of handling remapping to a larger size.

During the mremap() operation we detect pmd_share'd mappings and we
unshare those during the mremap().  On access and fault the sharing is
established again.

Link: https://lkml.kernel.org/r/20211013195825.3058275-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
Cc: Ken Chen <kenchen@google.com>
Cc: Chris Kennelly <ckennelly@google.com>
Cc: Michal Hocko <mhocko@suse.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Kirill Shutemov <kirill@shutemov.name>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
  • Loading branch information
mina authored and torvalds committed Nov 6, 2021
1 parent bd3400e commit 550a7d6
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 9 deletions.
19 changes: 19 additions & 0 deletions include/linux/hugetlb.h
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages,
void hugepage_put_subpool(struct hugepage_subpool *spool);

void reset_vma_resv_huge_pages(struct vm_area_struct *vma);
void clear_vma_resv_huge_pages(struct vm_area_struct *vma);
int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *);
int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *,
loff_t *);
Expand All @@ -132,6 +133,10 @@ int hugetlb_treat_movable_handler(struct ctl_table *, int, void *, size_t *,
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *, int, void *, size_t *,
loff_t *);

int move_hugetlb_page_tables(struct vm_area_struct *vma,
struct vm_area_struct *new_vma,
unsigned long old_addr, unsigned long new_addr,
unsigned long len);
int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
long follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *,
struct page **, struct vm_area_struct **,
Expand Down Expand Up @@ -215,6 +220,10 @@ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}

static inline void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
{
}

static inline unsigned long hugetlb_total_pages(void)
{
return 0;
Expand Down Expand Up @@ -262,6 +271,16 @@ static inline int copy_hugetlb_page_range(struct mm_struct *dst,
return 0;
}

static inline int move_hugetlb_page_tables(struct vm_area_struct *vma,
struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr,
unsigned long len)
{
BUG();
return 0;
}

static inline void hugetlb_report_meminfo(struct seq_file *m)
{
}
Expand Down
111 changes: 105 additions & 6 deletions mm/hugetlb.c
Original file line number Diff line number Diff line change
Expand Up @@ -1014,6 +1014,35 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma)
vma->vm_private_data = (void *)0;
}

/*
* Reset and decrement one ref on hugepage private reservation.
* Called with mm->mmap_sem writer semaphore held.
* This function should be only used by move_vma() and operate on
* same sized vma. It should never come here with last ref on the
* reservation.
*/
void clear_vma_resv_huge_pages(struct vm_area_struct *vma)
{
/*
* Clear the old hugetlb private page reservation.
* It has already been transferred to new_vma.
*
* During a mremap() operation of a hugetlb vma we call move_vma()
* which copies vma into new_vma and unmaps vma. After the copy
* operation both new_vma and vma share a reference to the resv_map
* struct, and at that point vma is about to be unmapped. We don't
* want to return the reservation to the pool at unmap of vma because
* the reservation still lives on in new_vma, so simply decrement the
* ref here and remove the resv_map reference from this vma.
*/
struct resv_map *reservations = vma_resv_map(vma);

if (reservations && is_vma_resv_set(vma, HPAGE_RESV_OWNER))
kref_put(&reservations->refs, resv_map_release);

reset_vma_resv_huge_pages(vma);
}

/* Returns true if the VMA has associated reserve pages */
static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
{
Expand Down Expand Up @@ -4718,6 +4747,82 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
return ret;
}

static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
unsigned long new_addr, pte_t *src_pte)
{
struct hstate *h = hstate_vma(vma);
struct mm_struct *mm = vma->vm_mm;
pte_t *dst_pte, pte;
spinlock_t *src_ptl, *dst_ptl;

dst_pte = huge_pte_offset(mm, new_addr, huge_page_size(h));
dst_ptl = huge_pte_lock(h, mm, dst_pte);
src_ptl = huge_pte_lockptr(h, mm, src_pte);

/*
* We don't have to worry about the ordering of src and dst ptlocks
* because exclusive mmap_sem (or the i_mmap_lock) prevents deadlock.
*/
if (src_ptl != dst_ptl)
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);

pte = huge_ptep_get_and_clear(mm, old_addr, src_pte);
set_huge_pte_at(mm, new_addr, dst_pte, pte);

if (src_ptl != dst_ptl)
spin_unlock(src_ptl);
spin_unlock(dst_ptl);
}

int move_hugetlb_page_tables(struct vm_area_struct *vma,
struct vm_area_struct *new_vma,
unsigned long old_addr, unsigned long new_addr,
unsigned long len)
{
struct hstate *h = hstate_vma(vma);
struct address_space *mapping = vma->vm_file->f_mapping;
unsigned long sz = huge_page_size(h);
struct mm_struct *mm = vma->vm_mm;
unsigned long old_end = old_addr + len;
unsigned long old_addr_copy;
pte_t *src_pte, *dst_pte;
struct mmu_notifier_range range;

mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, old_addr,
old_end);
adjust_range_if_pmd_sharing_possible(vma, &range.start, &range.end);
mmu_notifier_invalidate_range_start(&range);
/* Prevent race with file truncation */
i_mmap_lock_write(mapping);
for (; old_addr < old_end; old_addr += sz, new_addr += sz) {
src_pte = huge_pte_offset(mm, old_addr, sz);
if (!src_pte)
continue;
if (huge_pte_none(huge_ptep_get(src_pte)))
continue;

/* old_addr arg to huge_pmd_unshare() is a pointer and so the
* arg may be modified. Pass a copy instead to preserve the
* value in old_addr.
*/
old_addr_copy = old_addr;

if (huge_pmd_unshare(mm, vma, &old_addr_copy, src_pte))
continue;

dst_pte = huge_pte_alloc(mm, new_vma, new_addr, sz);
if (!dst_pte)
break;

move_huge_pte(vma, old_addr, new_addr, src_pte);
}
i_mmap_unlock_write(mapping);
flush_tlb_range(vma, old_end - len, old_end);
mmu_notifier_invalidate_range_end(&range);

return len + old_addr - old_end;
}

static void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long start, unsigned long end,
struct page *ref_page)
Expand Down Expand Up @@ -6257,12 +6362,6 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
* sharing is possible. For hugetlbfs, this prevents removal of any page
* table entries associated with the address space. This is important as we
* are setting up sharing based on existing page table entries (mappings).
*
* NOTE: This routine is only called from huge_pte_alloc. Some callers of
* huge_pte_alloc know that sharing is not possible and do not take
* i_mmap_rwsem as a performance optimization. This is handled by the
* if !vma_shareable check at the beginning of the routine. i_mmap_rwsem is
* only required for subsequent processing.
*/
pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pud_t *pud)
Expand Down
36 changes: 33 additions & 3 deletions mm/mremap.c
Original file line number Diff line number Diff line change
Expand Up @@ -489,6 +489,10 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);

if (is_vm_hugetlb_page(vma))
return move_hugetlb_page_tables(vma, new_vma, old_addr,
new_addr, len);

mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm,
old_addr, old_end);
mmu_notifier_invalidate_range_start(&range);
Expand Down Expand Up @@ -646,6 +650,10 @@ static unsigned long move_vma(struct vm_area_struct *vma,
mremap_userfaultfd_prep(new_vma, uf);
}

if (is_vm_hugetlb_page(vma)) {
clear_vma_resv_huge_pages(vma);
}

/* Conceal VM_ACCOUNT so old reservation is not undone */
if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
vma->vm_flags &= ~VM_ACCOUNT;
Expand Down Expand Up @@ -739,9 +747,6 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
(vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)))
return ERR_PTR(-EINVAL);

if (is_vm_hugetlb_page(vma))
return ERR_PTR(-EINVAL);

/* We can't remap across vm area boundaries */
if (old_len > vma->vm_end - addr)
return ERR_PTR(-EFAULT);
Expand Down Expand Up @@ -937,6 +942,31 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,

if (mmap_write_lock_killable(current->mm))
return -EINTR;
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr) {
ret = EFAULT;
goto out;
}

if (is_vm_hugetlb_page(vma)) {
struct hstate *h __maybe_unused = hstate_vma(vma);

old_len = ALIGN(old_len, huge_page_size(h));
new_len = ALIGN(new_len, huge_page_size(h));

/* addrs must be huge page aligned */
if (addr & ~huge_page_mask(h))
goto out;
if (new_addr & ~huge_page_mask(h))
goto out;

/*
* Don't allow remap expansion, because the underlying hugetlb
* reservation is not yet capable to handle split reservation.
*/
if (new_len > old_len)
goto out;
}

if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
ret = mremap_to(addr, old_len, new_addr, new_len,
Expand Down

0 comments on commit 550a7d6

Please sign in to comment.