Skip to content

Commit

Permalink
[PATCH] add page_mkwrite() vm_operations method
Browse files Browse the repository at this point in the history
Add a new VMA operation to notify a filesystem or other driver about the
MMU generating a fault because userspace attempted to write to a page
mapped through a read-only PTE.

This facility permits the filesystem or driver to:

 (*) Implement storage allocation/reservation on attempted write, and so to
     deal with problems such as ENOSPC more gracefully (perhaps by generating
     SIGBUS).

 (*) Delay making the page writable until the contents have been written to a
     backing cache. This is useful for NFS/AFS when using FS-Cache/CacheFS.
     It permits the filesystem to have some guarantee about the state of the
     cache.

 (*) Account and limit number of dirty pages. This is one piece of the puzzle
     needed to make shared writable mapping work safely in FUSE.

Needed by cachefs (Or is it cachefiles?  Or fscache? <head spins>).

At least four other groups have stated an interest in it or a desire to use
the functionality it provides: FUSE, OCFS2, NTFS and JFFS2.  Also, things like
EXT3 really ought to use it to deal with the case of shared-writable mmap
encountering ENOSPC before we permit the page to be dirtied.

From: Peter Zijlstra <a.p.zijlstra@chello.nl>

  get_user_pages(.write=1, .force=1) can generate COW hits on read-only
  shared mappings, this patch traps those as mkpage_write candidates and fails
  to handle them the old way.

Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Miklos Szeredi <miklos@szeredi.hu>
Cc: Joel Becker <Joel.Becker@oracle.com>
Cc: Mark Fasheh <mark.fasheh@oracle.com>
Cc: Anton Altaparmakov <aia21@cantab.net>
Cc: David Woodhouse <dwmw2@infradead.org>
Cc: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
  • Loading branch information
dhowells authored and Linus Torvalds committed Jun 23, 2006
1 parent bd96b9e commit 9637a5e
Show file tree
Hide file tree
Showing 4 changed files with 99 additions and 28 deletions.
4 changes: 4 additions & 0 deletions include/linux/mm.h
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,10 @@ struct vm_operations_struct {
void (*close)(struct vm_area_struct * area);
struct page * (*nopage)(struct vm_area_struct * area, unsigned long address, int *type);
int (*populate)(struct vm_area_struct * area, unsigned long address, unsigned long len, pgprot_t prot, unsigned long pgoff, int nonblock);

/* notification that a previously read-only page is about to become
* writable, if an error is returned it will cause a SIGBUS */
int (*page_mkwrite)(struct vm_area_struct *vma, struct page *page);
#ifdef CONFIG_NUMA
int (*set_policy)(struct vm_area_struct *vma, struct mempolicy *new);
struct mempolicy *(*get_policy)(struct vm_area_struct *vma,
Expand Down
100 changes: 76 additions & 24 deletions mm/memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -1457,25 +1457,60 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
{
struct page *old_page, *new_page;
pte_t entry;
int ret = VM_FAULT_MINOR;
int reuse, ret = VM_FAULT_MINOR;

old_page = vm_normal_page(vma, address, orig_pte);
if (!old_page)
goto gotten;

if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
int reuse = can_share_swap_page(old_page);
unlock_page(old_page);
if (reuse) {
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
ptep_set_access_flags(vma, address, page_table, entry, 1);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
ret |= VM_FAULT_WRITE;
goto unlock;
if (unlikely((vma->vm_flags & (VM_SHARED|VM_WRITE)) ==
(VM_SHARED|VM_WRITE))) {
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
/*
* Notify the address space that the page is about to
* become writable so that it can prohibit this or wait
* for the page to get into an appropriate state.
*
* We do this without the lock held, so that it can
* sleep if it needs to.
*/
page_cache_get(old_page);
pte_unmap_unlock(page_table, ptl);

if (vma->vm_ops->page_mkwrite(vma, old_page) < 0)
goto unwritable_page;

page_cache_release(old_page);

/*
* Since we dropped the lock we need to revalidate
* the PTE as someone else may have changed it. If
* they did, we just return, as we can count on the
* MMU to tell us if they didn't also make it writable.
*/
page_table = pte_offset_map_lock(mm, pmd, address,
&ptl);
if (!pte_same(*page_table, orig_pte))
goto unlock;
}

reuse = 1;
} else if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
reuse = can_share_swap_page(old_page);
unlock_page(old_page);
} else {
reuse = 0;
}

if (reuse) {
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
ptep_set_access_flags(vma, address, page_table, entry, 1);
update_mmu_cache(vma, address, entry);
lazy_mmu_prot_update(entry);
ret |= VM_FAULT_WRITE;
goto unlock;
}

/*
Expand Down Expand Up @@ -1535,6 +1570,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (old_page)
page_cache_release(old_page);
return VM_FAULT_OOM;

unwritable_page:
page_cache_release(old_page);
return VM_FAULT_SIGBUS;
}

/*
Expand Down Expand Up @@ -2083,18 +2122,31 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Should we do an early C-O-W break?
*/
if (write_access && !(vma->vm_flags & VM_SHARED)) {
struct page *page;
if (write_access) {
if (!(vma->vm_flags & VM_SHARED)) {
struct page *page;

if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_page_vma(GFP_HIGHUSER, vma, address);
if (!page)
goto oom;
copy_user_highpage(page, new_page, address);
page_cache_release(new_page);
new_page = page;
anon = 1;
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_page_vma(GFP_HIGHUSER, vma, address);
if (!page)
goto oom;
copy_user_highpage(page, new_page, address);
page_cache_release(new_page);
new_page = page;
anon = 1;

} else {
/* if the page will be shareable, see if the backing
* address space wants to know that the page is about
* to become writable */
if (vma->vm_ops->page_mkwrite &&
vma->vm_ops->page_mkwrite(vma, new_page) < 0
) {
page_cache_release(new_page);
return VM_FAULT_SIGBUS;
}
}
}

page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
Expand Down
12 changes: 10 additions & 2 deletions mm/mmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -1065,7 +1065,8 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
vma->vm_start = addr;
vma->vm_end = addr + len;
vma->vm_flags = vm_flags;
vma->vm_page_prot = protection_map[vm_flags & 0x0f];
vma->vm_page_prot = protection_map[vm_flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
vma->vm_pgoff = pgoff;

if (file) {
Expand All @@ -1089,6 +1090,12 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr,
goto free_vma;
}

/* Don't make the VMA automatically writable if it's shared, but the
* backer wishes to know when pages are first written to */
if (vma->vm_ops && vma->vm_ops->page_mkwrite)
vma->vm_page_prot =
protection_map[vm_flags & (VM_READ|VM_WRITE|VM_EXEC)];

/* We set VM_ACCOUNT in a shared mapping's vm_flags, to inform
* shmem_zero_setup (perhaps called through /dev/zero's ->mmap)
* that memory reservation must be checked; but that reservation
Expand Down Expand Up @@ -1921,7 +1928,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len)
vma->vm_end = addr + len;
vma->vm_pgoff = pgoff;
vma->vm_flags = flags;
vma->vm_page_prot = protection_map[flags & 0x0f];
vma->vm_page_prot = protection_map[flags &
(VM_READ|VM_WRITE|VM_EXEC|VM_SHARED)];
vma_link(mm, vma, prev, rb_link, rb_parent);
out:
mm->total_vm += len >> PAGE_SHIFT;
Expand Down
11 changes: 9 additions & 2 deletions mm/mprotect.c
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
unsigned long oldflags = vma->vm_flags;
long nrpages = (end - start) >> PAGE_SHIFT;
unsigned long charged = 0;
unsigned int mask;
pgprot_t newprot;
pgoff_t pgoff;
int error;
Expand All @@ -149,8 +150,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
}
}

newprot = protection_map[newflags & 0xf];

/*
* First try to merge with previous and/or next vma.
*/
Expand All @@ -177,6 +176,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
}

success:
/* Don't make the VMA automatically writable if it's shared, but the
* backer wishes to know when pages are first written to */
mask = VM_READ|VM_WRITE|VM_EXEC|VM_SHARED;
if (vma->vm_ops && vma->vm_ops->page_mkwrite)
mask &= ~VM_SHARED;

newprot = protection_map[newflags & mask];

/*
* vm_flags and vm_page_prot are protected by the mmap_sem
* held in write mode.
Expand Down

0 comments on commit 9637a5e

Please sign in to comment.