[PATCH] mm: split page table lock
Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.

This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock.  (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
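
For illustration, the caller-side pattern this enables looks roughly like the
sketch below, using the pte_offset_map_lock/pte_unmap_unlock helpers that this
patch redirects to the per-page lock (the wrapper function itself is
hypothetical, not part of the patch):

/*
 * Hypothetical example: inspect one pte while holding only the lock of
 * its page table page, rather than the mm-wide page_table_lock.
 */
static int example_probe_pte(struct mm_struct *mm, pmd_t *pmd,
                             unsigned long address)
{
        spinlock_t *ptl;
        pte_t *pte;
        int present;

        pte = pte_offset_map_lock(mm, pmd, address, &ptl);
        present = pte_present(*pte);    /* safe: this pte page is locked */
        pte_unmap_unlock(pte, ptl);
        return present;
}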

In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
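
The lock's lifetime follows the page table page's: it is initialized when the
page is allocated and torn down just before the page is freed.  A condensed
sketch (hypothetical helper, simplified from the mm/memory.c hunks below;
error handling, accounting and the "someone else populated it" race omitted):

static int example_pte_page_lifetime(struct mm_struct *mm, pmd_t *pmd,
                                     unsigned long address)
{
        struct page *new = pte_alloc_one(mm, address);

        if (!new)
                return -ENOMEM;
        pte_lock_init(new);                     /* lock lives in struct page */
        spin_lock(&mm->page_table_lock);        /* still guards the pmd entry */
        pmd_populate(mm, pmd, new);
        spin_unlock(&mm->page_table_lock);

        /* ... much later, when tearing the page table down ... */
        pmd_clear(pmd);
        pte_lock_deinit(new);                   /* reset page->mapping */
        pte_free(new);
        return 0;
}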

Splitting the lock is not quite for free: another cacheline access.  Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS.  But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
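
Concretely, the preprocessor comparison ends up selecting between the two
locking schemes as in the include/linux/mm.h hunk further down; its core
(excerpted, with comments added here) is:

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
/* lock the pte page itself: per page-table-page spinlock in struct page */
#define pte_lockptr(mm, pmd)    ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
#else
/* one lock for the whole mm, as before */
#define pte_lockptr(mm, pmd)    ({(void)(pmd); &(mm)->page_table_lock;})
#endif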

There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Hugh Dickins authored and Linus Torvalds committed Oct 30, 2005
1 parent b38c684 commit 4c21e2f
Showing 23 changed files with 138 additions and 79 deletions.
1 change: 1 addition & 0 deletions arch/arm/mm/mm-armv.c
@@ -229,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd)
         pte = pmd_page(*pmd);
         pmd_clear(pmd);
         dec_page_state(nr_page_table_pages);
+        pte_lock_deinit(pte);
         pte_free(pte);
         pmd_free(pmd);
 free:
4 changes: 2 additions & 2 deletions arch/frv/mm/pgalloc.c
@@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd)
         if (pgd_list)
                 pgd_list->private = (unsigned long) &page->index;
         pgd_list = page;
-        page->private = (unsigned long) &pgd_list;
+        set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
         struct page *next, **pprev, *page = virt_to_page(pgd);
         next = (struct page *) page->index;
-        pprev = (struct page **) page->private;
+        pprev = (struct page **)page_private(page);
         *pprev = next;
         if (next)
                 next->private = (unsigned long) pprev;
8 changes: 4 additions & 4 deletions arch/i386/mm/pgtable.c
@@ -188,19 +188,19 @@ static inline void pgd_list_add(pgd_t *pgd)
         struct page *page = virt_to_page(pgd);
         page->index = (unsigned long)pgd_list;
         if (pgd_list)
-                pgd_list->private = (unsigned long)&page->index;
+                set_page_private(pgd_list, (unsigned long)&page->index);
         pgd_list = page;
-        page->private = (unsigned long)&pgd_list;
+        set_page_private(page, (unsigned long)&pgd_list);
 }
 
 static inline void pgd_list_del(pgd_t *pgd)
 {
         struct page *next, **pprev, *page = virt_to_page(pgd);
         next = (struct page *)page->index;
-        pprev = (struct page **)page->private;
+        pprev = (struct page **)page_private(page);
         *pprev = next;
         if (next)
-                next->private = (unsigned long)pprev;
+                set_page_private(next, (unsigned long)pprev);
 }
 
 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
1 change: 1 addition & 0 deletions arch/um/kernel/skas/mmu.c
@@ -144,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm)
 
         if(!proc_mm || !ptrace_faultinfo){
                 free_page(mmu->id.stack);
+                pte_lock_deinit(virt_to_page(mmu->last_page_table));
                 pte_free_kernel((pte_t *) mmu->last_page_table);
                 dec_page_state(nr_page_table_pages);
 #ifdef CONFIG_3_LEVEL_PGTABLES
4 changes: 2 additions & 2 deletions fs/afs/file.c
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
         cachefs_uncache_page(vnode->cache, page);
 #endif
 
-        pageio = (struct cachefs_page *) page->private;
-        page->private = 0;
+        pageio = (struct cachefs_page *) page_private(page);
+        set_page_private(page, 0);
         ClearPagePrivate(page);
 
         if (pageio)
2 changes: 1 addition & 1 deletion fs/buffer.c
@@ -96,7 +96,7 @@ static void
 __clear_page_buffers(struct page *page)
 {
         ClearPagePrivate(page);
-        page->private = 0;
+        set_page_private(page, 0);
         page_cache_release(page);
 }
 
12 changes: 6 additions & 6 deletions fs/jfs/jfs_metapage.c
@@ -86,7 +86,7 @@ struct meta_anchor {
         atomic_t io_count;
         struct metapage *mp[MPS_PER_PAGE];
 };
-#define mp_anchor(page) ((struct meta_anchor *)page->private)
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
 
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
                         if (!a)
                                 return -ENOMEM;
                         memset(a, 0, sizeof(struct meta_anchor));
-                        page->private = (unsigned long)a;
+                        set_page_private(page, (unsigned long)a);
                         SetPagePrivate(page);
                         kmap(page);
                 }
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
         a->mp[index] = NULL;
         if (--a->mp_count == 0) {
                 kfree(a);
-                page->private = 0;
+                set_page_private(page, 0);
                 ClearPagePrivate(page);
                 kunmap(page);
         }
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 #else
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
-        return PagePrivate(page) ? (struct metapage *)page->private : NULL;
+        return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
 
 static inline int insert_metapage(struct page *page, struct metapage *mp)
 {
         if (mp) {
-                page->private = (unsigned long)mp;
+                set_page_private(page, (unsigned long)mp);
                 SetPagePrivate(page);
                 kmap(page);
         }
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 
 static inline void remove_metapage(struct page *page, struct metapage *mp)
 {
-        page->private = 0;
+        set_page_private(page, 0);
         ClearPagePrivate(page);
         kunmap(page);
 }
7 changes: 4 additions & 3 deletions fs/xfs/linux-2.6/xfs_buf.c
@@ -181,8 +181,9 @@ set_page_region(
         size_t          offset,
         size_t          length)
 {
-        page->private |= page_region_mask(offset, length);
-        if (page->private == ~0UL)
+        set_page_private(page,
+                page_private(page) | page_region_mask(offset, length));
+        if (page_private(page) == ~0UL)
                 SetPageUptodate(page);
 }
 
@@ -194,7 +195,7 @@ test_page_region(
 {
         unsigned long   mask = page_region_mask(offset, length);
 
-        return (mask && (page->private & mask) == mask);
+        return (mask && (page_private(page) & mask) == mask);
 }
 
 /*
6 changes: 3 additions & 3 deletions include/linux/buffer_head.h
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp)
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)                                      \
         ({                                                      \
-                BUG_ON(!PagePrivate(page));                     \
-                ((struct buffer_head *)(page)->private);        \
+                BUG_ON(!PagePrivate(page));                     \
+                ((struct buffer_head *)page_private(page));     \
         })
 #define page_has_buffers(page)  PagePrivate(page)
 
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page,
 {
         page_cache_get(page);
         SetPagePrivate(page);
-        page->private = (unsigned long)head;
+        set_page_private(page, (unsigned long)head);
 }
 
 static inline void get_bh(struct buffer_head *bh)
46 changes: 38 additions & 8 deletions include/linux/mm.h
@@ -226,13 +226,18 @@ struct page {
                                          * to show when page is mapped
                                          * & limit reverse map searches.
                                          */
-        unsigned long private;          /* Mapping-private opaque data:
+        union {
+                unsigned long private;  /* Mapping-private opaque data:
                                          * usually used for buffer_heads
                                          * if PagePrivate set; used for
                                          * swp_entry_t if PageSwapCache
                                          * When page is free, this indicates
                                          * order in the buddy system.
                                          */
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+                spinlock_t ptl;
+#endif
+        } u;
         struct address_space *mapping;  /* If low bit clear, points to
                                          * inode address_space, or NULL.
                                          * If page mapped as anonymous
@@ -260,6 +265,9 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
 
+#define page_private(page)              ((page)->u.private)
+#define set_page_private(page, v)       ((page)->u.private = (v))
+
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *));
 
 #ifdef CONFIG_HUGETLB_PAGE
 
-static inline int page_count(struct page *p)
+static inline int page_count(struct page *page)
 {
-        if (PageCompound(p))
-                p = (struct page *)p->private;
-        return atomic_read(&(p)->_count) + 1;
+        if (PageCompound(page))
+                page = (struct page *)page_private(page);
+        return atomic_read(&page->_count) + 1;
 }
 
 static inline void get_page(struct page *page)
 {
         if (unlikely(PageCompound(page)))
-                page = (struct page *)page->private;
+                page = (struct page *)page_private(page);
         atomic_inc(&page->_count);
 }
 
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page)
 static inline pgoff_t page_index(struct page *page)
 {
         if (unlikely(PageSwapCache(page)))
-                return page->private;
+                return page_private(page);
         return page->index;
 }
 
@@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
 
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page)     &((page)->u.ptl)
+#define pte_lock_init(_page)    do {                                    \
+        spin_lock_init(__pte_lockptr(_page));                           \
+} while (0)
+#define pte_lock_deinit(page)   ((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd)    ({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page)     do {} while (0)
+#define pte_lock_deinit(page)   do {} while (0)
+#define pte_lockptr(mm, pmd)    ({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
+
 #define pte_offset_map_lock(mm, pmd, address, ptlp)     \
 ({                                                      \
-        spinlock_t *__ptl = &(mm)->page_table_lock;     \
+        spinlock_t *__ptl = pte_lockptr(mm, pmd);       \
         pte_t *__pte = pte_offset_map(pmd, address);    \
         *(ptlp) = __ptl;                                \
         spin_lock(__ptl);                               \
4 changes: 2 additions & 2 deletions kernel/kexec.c
@@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
         if (pages) {
                 unsigned int count, i;
                 pages->mapping = NULL;
-                pages->private = order;
+                set_page_private(pages, order);
                 count = 1 << order;
                 for (i = 0; i < count; i++)
                         SetPageReserved(pages + i);
@@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page)
 {
         unsigned int order, count, i;
 
-        order = page->private;
+        order = page_private(page);
         count = 1 << order;
         for (i = 0; i < count; i++)
                 ClearPageReserved(page + i);
13 changes: 13 additions & 0 deletions mm/Kconfig
@@ -111,3 +111,16 @@ config SPARSEMEM_STATIC
 config SPARSEMEM_EXTREME
         def_bool y
         depends on SPARSEMEM && !SPARSEMEM_STATIC
+
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+        int
+        default "4096" if ARM && !CPU_CACHE_VIPT
+        default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+        default "4"
2 changes: 1 addition & 1 deletion mm/filemap.c
@@ -152,7 +152,7 @@ static int sync_page(void *word)
          * in the ->sync_page() methods make essential use of the
          * page_mapping(), merely passing the page down to the backing
          * device's unplug functions when it's non-NULL, which in turn
-         * ignore it for all cases but swap, where only page->private is
+         * ignore it for all cases but swap, where only page_private(page) is
          * of interest. When page_mapping() does go NULL, the entire
          * call stack gracefully ignores the page and returns.
          * -- wli
24 changes: 14 additions & 10 deletions mm/memory.c
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
         struct page *page = pmd_page(*pmd);
         pmd_clear(pmd);
+        pte_lock_deinit(page);
         pte_free_tlb(tlb, page);
         dec_page_state(nr_page_table_pages);
         tlb->mm->nr_ptes--;
@@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
         if (!new)
                 return -ENOMEM;
 
+        pte_lock_init(new);
         spin_lock(&mm->page_table_lock);
-        if (pmd_present(*pmd))          /* Another has populated it */
+        if (pmd_present(*pmd)) {        /* Another has populated it */
+                pte_lock_deinit(new);
                 pte_free(new);
-        else {
+        } else {
                 mm->nr_ptes++;
                 inc_page_state(nr_page_table_pages);
                 pmd_populate(mm, pmd, new);
@@ -432,7 +435,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         if (!dst_pte)
                 return -ENOMEM;
         src_pte = pte_offset_map_nested(src_pmd, addr);
-        src_ptl = &src_mm->page_table_lock;
+        src_ptl = pte_lockptr(src_mm, src_pmd);
         spin_lock(src_ptl);
 
         do {
@@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range);
  * (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page and do_no_page can safely check later on).
  */
-static inline int pte_unmap_same(struct mm_struct *mm,
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
                                 pte_t *page_table, pte_t orig_pte)
 {
         int same = 1;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
         if (sizeof(pte_t) > sizeof(unsigned long)) {
-                spin_lock(&mm->page_table_lock);
+                spinlock_t *ptl = pte_lockptr(mm, pmd);
+                spin_lock(ptl);
                 same = pte_same(*page_table, orig_pte);
-                spin_unlock(&mm->page_table_lock);
+                spin_unlock(ptl);
         }
 #endif
         pte_unmap(page_table);
@@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         pte_t pte;
         int ret = VM_FAULT_MINOR;
 
-        if (!pte_unmap_same(mm, page_table, orig_pte))
+        if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
                 goto out;
 
         entry = pte_to_swp_entry(orig_pte);
@@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 page_cache_get(page);
                 entry = mk_pte(page, vma->vm_page_prot);
 
-                ptl = &mm->page_table_lock;
+                ptl = pte_lockptr(mm, pmd);
                 spin_lock(ptl);
                 if (!pte_none(*page_table))
                         goto release;
@@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
         pgoff_t pgoff;
         int err;
 
-        if (!pte_unmap_same(mm, page_table, orig_pte))
+        if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
                 return VM_FAULT_MINOR;
 
         if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
@@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
                                                 pte, pmd, write_access, entry);
         }
 
-        ptl = &mm->page_table_lock;
+        ptl = pte_lockptr(mm, pmd);
         spin_lock(ptl);
         if (unlikely(!pte_same(*pte, entry)))
                 goto unlock;