Skip to content

Commit

Permalink
mm, vmscan: move lru_lock to the node
Browse files Browse the repository at this point in the history
Node-based reclaim requires node-based LRUs and locking.  This is a
preparation patch that just moves the lru_lock to the node so later
patches are easier to review.  It is a mechanical change, but note that
this patch makes contention worse because the LRU lock is hotter, and
direct reclaim and kswapd can contend on the same lock even when
reclaiming from different zones.

Link: http://lkml.kernel.org/r/1467970510-21195-3-git-send-email-mgorman@techsingularity.net
Signed-off-by: Mel Gorman <mgorman@techsingularity.net>
Reviewed-by: Minchan Kim <minchan@kernel.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Hillf Danton <hillf.zj@alibaba-inc.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Rik van Riel <riel@surriel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
  • Loading branch information
gormanm authored and torvalds committed Jul 28, 2016
1 parent 75ef718 commit a52633d
Show file tree
Hide file tree
Showing 14 changed files with 75 additions and 69 deletions.
4 changes: 2 additions & 2 deletions Documentation/cgroup-v1/memcg_test.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,9 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.

8. LRU
Each memcg has its own private LRU. Now, its handling is under global
VM's control (means that it's handled under global zone->lru_lock).
VM's control (means that it's handled under global zone_lru_lock).
Almost all routines around memcg's LRU is called by global LRU's
list management functions under zone->lru_lock().
list management functions under zone_lru_lock().

A special function is mem_cgroup_isolate_pages(). This scans
memcg's private LRU and call __isolate_lru_page() to extract a page
Expand Down
4 changes: 2 additions & 2 deletions Documentation/cgroup-v1/memory.txt
Original file line number Diff line number Diff line change
Expand Up @@ -267,11 +267,11 @@ When oom event notifier is registered, event will be delivered.
Other lock order is following:
PG_locked.
mm->page_table_lock
zone->lru_lock
zone_lru_lock
lock_page_cgroup.
In many cases, just lock_page_cgroup() is called.
per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by
zone->lru_lock, it has no lock of its own.
zone_lru_lock, it has no lock of its own.

2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM)

Expand Down
2 changes: 1 addition & 1 deletion include/linux/mm_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ struct page {
*/
union {
struct list_head lru; /* Pageout list, eg. active_list
* protected by zone->lru_lock !
* protected by zone_lru_lock !
* Can be used as a generic list
* by the page owner.
*/
Expand Down
10 changes: 8 additions & 2 deletions include/linux/mmzone.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ struct free_area {
struct pglist_data;

/*
* zone->lock and zone->lru_lock are two of the hottest locks in the kernel.
* zone->lock and the zone lru_lock are two of the hottest locks in the kernel.
* So add a wild amount of padding here to ensure that they fall into separate
* cachelines. There are very few zone structures in the machine, so space
* consumption is not a concern here.
Expand Down Expand Up @@ -496,7 +496,6 @@ struct zone {
/* Write-intensive fields used by page reclaim */

/* Fields commonly accessed by the page reclaim scanner */
spinlock_t lru_lock;
struct lruvec lruvec;

/*
Expand Down Expand Up @@ -690,6 +689,9 @@ typedef struct pglist_data {
/* Number of pages migrated during the rate limiting time interval */
unsigned long numabalancing_migrate_nr_pages;
#endif
/* Write-intensive fields used by page reclaim */
ZONE_PADDING(_pad1_)
spinlock_t lru_lock;

#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/*
Expand Down Expand Up @@ -721,6 +723,10 @@ typedef struct pglist_data {

#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
#define node_end_pfn(nid) pgdat_end_pfn(NODE_DATA(nid))
/*
 * zone_lru_lock - look up the LRU spinlock that covers @zone.
 *
 * The lock is the lru_lock member of the node that @zone points at through
 * zone->zone_pgdat, so zones sharing a node share one lock.  This helper
 * only computes the address; it does not acquire or release anything.
 *
 * Returns a pointer to that node's lru_lock.
 */
static inline spinlock_t *zone_lru_lock(struct zone *zone)
{
	spinlock_t *node_lock = &zone->zone_pgdat->lru_lock;

	return node_lock;
}

static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat)
{
Expand Down
10 changes: 5 additions & 5 deletions mm/compaction.c
Original file line number Diff line number Diff line change
Expand Up @@ -752,7 +752,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* if contended.
*/
if (!(low_pfn % SWAP_CLUSTER_MAX)
&& compact_unlock_should_abort(&zone->lru_lock, flags,
&& compact_unlock_should_abort(zone_lru_lock(zone), flags,
&locked, cc))
break;

Expand Down Expand Up @@ -813,7 +813,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
spin_unlock_irqrestore(&zone->lru_lock,
spin_unlock_irqrestore(zone_lru_lock(zone),
flags);
locked = false;
}
Expand All @@ -836,7 +836,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,

/* If we already hold the lock, we can skip some rechecking */
if (!locked) {
locked = compact_trylock_irqsave(&zone->lru_lock,
locked = compact_trylock_irqsave(zone_lru_lock(zone),
&flags, cc);
if (!locked)
break;
Expand Down Expand Up @@ -899,7 +899,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (nr_isolated) {
if (locked) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
locked = false;
}
acct_isolated(zone, cc);
Expand Down Expand Up @@ -927,7 +927,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
low_pfn = end_pfn;

if (locked)
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(zone_lru_lock(zone), flags);

/*
* Update the pageblock-skip information and cached scanner pfn,
Expand Down
4 changes: 2 additions & 2 deletions mm/filemap.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,8 @@
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->tree_lock (try_to_unmap_one)
* ->zone.lru_lock (follow_page->mark_page_accessed)
* ->zone.lru_lock (check_pte_range->isolate_lru_page)
* ->zone_lru_lock(zone) (follow_page->mark_page_accessed)
* ->zone_lru_lock(zone) (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->tree_lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
Expand Down
6 changes: 3 additions & 3 deletions mm/huge_memory.c
Original file line number Diff line number Diff line change
Expand Up @@ -1848,7 +1848,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
spin_unlock(&head->mapping->tree_lock);
}

spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);

unfreeze_page(head);

Expand Down Expand Up @@ -2034,7 +2034,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
lru_add_drain();

/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irqsave(&page_zone(head)->lru_lock, flags);
spin_lock_irqsave(zone_lru_lock(page_zone(head)), flags);

if (mapping) {
void **pslot;
Expand Down Expand Up @@ -2077,7 +2077,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
spin_unlock(&pgdata->split_queue_lock);
fail: if (mapping)
spin_unlock(&mapping->tree_lock);
spin_unlock_irqrestore(&page_zone(head)->lru_lock, flags);
spin_unlock_irqrestore(zone_lru_lock(page_zone(head)), flags);
unfreeze_page(head);
ret = -EBUSY;
}
Expand Down
6 changes: 3 additions & 3 deletions mm/memcontrol.c
Original file line number Diff line number Diff line change
Expand Up @@ -2065,7 +2065,7 @@ static void lock_page_lru(struct page *page, int *isolated)
{
struct zone *zone = page_zone(page);

spin_lock_irq(&zone->lru_lock);
spin_lock_irq(zone_lru_lock(zone));
if (PageLRU(page)) {
struct lruvec *lruvec;

Expand All @@ -2089,7 +2089,7 @@ static void unlock_page_lru(struct page *page, int isolated)
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, page_lru(page));
}
spin_unlock_irq(&zone->lru_lock);
spin_unlock_irq(zone_lru_lock(zone));
}

static void commit_charge(struct page *page, struct mem_cgroup *memcg,
Expand Down Expand Up @@ -2389,7 +2389,7 @@ void memcg_kmem_uncharge(struct page *page, int order)

/*
* Because tail pages are not marked as "used", set it. We're under
* zone->lru_lock and migration entries setup in all page mappings.
* zone_lru_lock and migration entries setup in all page mappings.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
Expand Down
10 changes: 5 additions & 5 deletions mm/mlock.c
Original file line number Diff line number Diff line change
Expand Up @@ -188,7 +188,7 @@ unsigned int munlock_vma_page(struct page *page)
* might otherwise copy PageMlocked to part of the tail pages before
* we clear it in the head page. It also stabilizes hpage_nr_pages().
*/
spin_lock_irq(&zone->lru_lock);
spin_lock_irq(zone_lru_lock(zone));

nr_pages = hpage_nr_pages(page);
if (!TestClearPageMlocked(page))
Expand All @@ -197,14 +197,14 @@ unsigned int munlock_vma_page(struct page *page)
__mod_zone_page_state(zone, NR_MLOCK, -nr_pages);

if (__munlock_isolate_lru_page(page, true)) {
spin_unlock_irq(&zone->lru_lock);
spin_unlock_irq(zone_lru_lock(zone));
__munlock_isolated_page(page);
goto out;
}
__munlock_isolation_failed(page);

unlock_out:
spin_unlock_irq(&zone->lru_lock);
spin_unlock_irq(zone_lru_lock(zone));

out:
return nr_pages - 1;
Expand Down Expand Up @@ -289,7 +289,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pagevec_init(&pvec_putback, 0);

/* Phase 1: page isolation */
spin_lock_irq(&zone->lru_lock);
spin_lock_irq(zone_lru_lock(zone));
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];

Expand All @@ -315,7 +315,7 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
}
delta_munlocked = -nr + pagevec_count(&pvec_putback);
__mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
spin_unlock_irq(&zone->lru_lock);
spin_unlock_irq(zone_lru_lock(zone));

/* Now we can release pins of pages that we are not munlocking */
pagevec_release(&pvec_putback);
Expand Down
4 changes: 2 additions & 2 deletions mm/page_alloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -5904,6 +5904,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->kcompactd_wait);
#endif
pgdat_page_ext_init(pgdat);
spin_lock_init(&pgdat->lru_lock);

for (j = 0; j < MAX_NR_ZONES; j++) {
struct zone *zone = pgdat->node_zones + j;
Expand Down Expand Up @@ -5958,10 +5959,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
zone->min_slab_pages = (freesize * sysctl_min_slab_ratio) / 100;
#endif
zone->name = zone_names[j];
zone->zone_pgdat = pgdat;
spin_lock_init(&zone->lock);
spin_lock_init(&zone->lru_lock);
zone_seqlock_init(zone);
zone->zone_pgdat = pgdat;
zone_pcp_init(zone);

/* For bootup, initialized properly in watermark setup */
Expand Down
4 changes: 2 additions & 2 deletions mm/page_idle.c
Original file line number Diff line number Diff line change
Expand Up @@ -41,12 +41,12 @@ static struct page *page_idle_get_page(unsigned long pfn)
return NULL;

zone = page_zone(page);
spin_lock_irq(&zone->lru_lock);
spin_lock_irq(zone_lru_lock(zone));
if (unlikely(!PageLRU(page))) {
put_page(page);
page = NULL;
}
spin_unlock_irq(&zone->lru_lock);
spin_unlock_irq(zone_lru_lock(zone));
return page;
}

Expand Down
2 changes: 1 addition & 1 deletion mm/rmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
* mapping->i_mmap_rwsem
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
* zone_lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
Expand Down
30 changes: 15 additions & 15 deletions mm/swap.c
Original file line number Diff line number Diff line change
Expand Up @@ -62,12 +62,12 @@ static void __page_cache_release(struct page *page)
struct lruvec *lruvec;
unsigned long flags;

spin_lock_irqsave(&zone->lru_lock, flags);
spin_lock_irqsave(zone_lru_lock(zone), flags);
lruvec = mem_cgroup_page_lruvec(page, zone);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
}
mem_cgroup_uncharge(page);
}
Expand Down Expand Up @@ -189,16 +189,16 @@ static void pagevec_lru_move_fn(struct pagevec *pvec,

if (pagezone != zone) {
if (zone)
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
zone = pagezone;
spin_lock_irqsave(&zone->lru_lock, flags);
spin_lock_irqsave(zone_lru_lock(zone), flags);
}

lruvec = mem_cgroup_page_lruvec(page, zone);
(*move_fn)(page, lruvec, arg);
}
if (zone)
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
release_pages(pvec->pages, pvec->nr, pvec->cold);
pagevec_reinit(pvec);
}
Expand Down Expand Up @@ -318,9 +318,9 @@ void activate_page(struct page *page)
struct zone *zone = page_zone(page);

page = compound_head(page);
spin_lock_irq(&zone->lru_lock);
spin_lock_irq(zone_lru_lock(zone));
__activate_page(page, mem_cgroup_page_lruvec(page, zone), NULL);
spin_unlock_irq(&zone->lru_lock);
spin_unlock_irq(zone_lru_lock(zone));
}
#endif

Expand Down Expand Up @@ -448,13 +448,13 @@ void add_page_to_unevictable_list(struct page *page)
struct zone *zone = page_zone(page);
struct lruvec *lruvec;

spin_lock_irq(&zone->lru_lock);
spin_lock_irq(zone_lru_lock(zone));
lruvec = mem_cgroup_page_lruvec(page, zone);
ClearPageActive(page);
SetPageUnevictable(page);
SetPageLRU(page);
add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE);
spin_unlock_irq(&zone->lru_lock);
spin_unlock_irq(zone_lru_lock(zone));
}

/**
Expand Down Expand Up @@ -744,7 +744,7 @@ void release_pages(struct page **pages, int nr, bool cold)
* same zone. The lock is held only if zone != NULL.
*/
if (zone && ++lock_batch == SWAP_CLUSTER_MAX) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
zone = NULL;
}

Expand All @@ -759,7 +759,7 @@ void release_pages(struct page **pages, int nr, bool cold)

if (PageCompound(page)) {
if (zone) {
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(zone_lru_lock(zone), flags);
zone = NULL;
}
__put_compound_page(page);
Expand All @@ -771,11 +771,11 @@ void release_pages(struct page **pages, int nr, bool cold)

if (pagezone != zone) {
if (zone)
spin_unlock_irqrestore(&zone->lru_lock,
spin_unlock_irqrestore(zone_lru_lock(zone),
flags);
lock_batch = 0;
zone = pagezone;
spin_lock_irqsave(&zone->lru_lock, flags);
spin_lock_irqsave(zone_lru_lock(zone), flags);
}

lruvec = mem_cgroup_page_lruvec(page, zone);
Expand All @@ -790,7 +790,7 @@ void release_pages(struct page **pages, int nr, bool cold)
list_add(&page->lru, &pages_to_free);
}
if (zone)
spin_unlock_irqrestore(&zone->lru_lock, flags);
spin_unlock_irqrestore(zone_lru_lock(zone), flags);

mem_cgroup_uncharge_list(&pages_to_free);
free_hot_cold_page_list(&pages_to_free, cold);
Expand Down Expand Up @@ -826,7 +826,7 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
VM_BUG_ON_PAGE(PageCompound(page_tail), page);
VM_BUG_ON_PAGE(PageLRU(page_tail), page);
VM_BUG_ON(NR_CPUS != 1 &&
!spin_is_locked(&lruvec_zone(lruvec)->lru_lock));
!spin_is_locked(zone_lru_lock(lruvec_zone(lruvec))));

if (!list)
SetPageLRU(page_tail);
Expand Down
Loading

0 comments on commit a52633d

Please sign in to comment.