Skip to content

Commit

Permalink
sched/numa: Skip some page migrations after a shared fault
Browse files Browse the repository at this point in the history
Shared faults can lead to lots of unnecessary page migrations,
slowing down the system, and causing private faults to hit the
per-pgdat migration ratelimit.

This patch adds sysctl numa_balancing_migrate_deferred, which specifies
how many shared page migrations to skip unconditionally, after each page
migration that is skipped because it is a shared fault.

This reduces the number of page migrations back and forth in
shared fault situations. It also gives a strong preference to
the tasks that are already running where most of the memory is,
and to moving the other tasks closer to that memory.

Testing this with a much higher scan rate than the default
still seems to result in fewer page migrations than before.

Memory seems to be somewhat better consolidated than previously,
with multi-instance specjbb runs on a 4 node system.

Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-62-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
  • Loading branch information
Rik van Riel authored and Ingo Molnar committed Oct 9, 2013
1 parent 1e3646f commit de1c9ce
Show file tree
Hide file tree
Showing 5 changed files with 75 additions and 3 deletions.
10 changes: 9 additions & 1 deletion Documentation/sysctl/kernel.txt
Original file line number Diff line number Diff line change
Expand Up @@ -375,7 +375,8 @@ feature should be disabled. Otherwise, if the system overhead from the
feature is too high then the rate the kernel samples for NUMA hinting
faults may be controlled by the numa_balancing_scan_period_min_ms,
numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
numa_balancing_scan_size_mb and numa_balancing_settle_count sysctls.
numa_balancing_scan_size_mb, numa_balancing_settle_count and
numa_balancing_migrate_deferred sysctls.

==============================================================

Expand Down Expand Up @@ -421,6 +422,13 @@ the schedule balancer stops pushing the task towards a preferred node. This
gives the scheduler a chance to place the task on an alternative node if the
preferred node is overloaded.

numa_balancing_migrate_deferred is how many page migrations get skipped
unconditionally, after a page migration is skipped because a page is shared
with other tasks. This reduces page migration overhead, and determines
how much stronger the scheduler's "move task near its memory" policy becomes,
versus the memory management "move memory near its task" policy, for workloads
with shared memory.

==============================================================

osrelease, ostype & version:
Expand Down
5 changes: 4 additions & 1 deletion include/linux/sched.h
Original file line number Diff line number Diff line change
Expand Up @@ -1342,6 +1342,8 @@ struct task_struct {
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
int numa_preferred_nid;
int numa_migrate_deferred;
unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp */
struct callback_head numa_work;
Expand Down Expand Up @@ -1372,7 +1374,6 @@ struct task_struct {
*/
unsigned long numa_faults_locality[2];

int numa_preferred_nid;
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */

Expand Down Expand Up @@ -1469,6 +1470,8 @@ extern void task_numa_fault(int last_node, int node, int pages, int flags);
extern pid_t task_numa_group_id(struct task_struct *p);
extern void set_numabalancing_state(bool enabled);
extern void task_numa_free(struct task_struct *p);

extern unsigned int sysctl_numa_balancing_migrate_deferred;
#else
static inline void task_numa_fault(int last_node, int node, int pages,
int flags)
Expand Down
8 changes: 8 additions & 0 deletions kernel/sched/fair.c
Original file line number Diff line number Diff line change
Expand Up @@ -833,6 +833,14 @@ unsigned int sysctl_numa_balancing_scan_size = 256;
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

/*
 * After skipping a page migration on a shared page, skip N more numa page
 * migrations unconditionally. This reduces the number of NUMA migrations
 * in shared memory workloads, and has the effect of pulling tasks towards
 * where their memory lives, over pulling the memory towards the task.
 *
 * N is this value, tunable through the numa_balancing_migrate_deferred
 * sysctl (default 16). defer_numa_migrate() copies it into the faulting
 * task's numa_migrate_deferred counter; numa_migrate_deferred() then
 * consumes that counter on later shared faults only, private faults are
 * never deferred.
 */
unsigned int sysctl_numa_balancing_migrate_deferred = 16;

static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
Expand Down
7 changes: 7 additions & 0 deletions kernel/sysctl.c
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
{
.procname = "numa_balancing_migrate_deferred",
.data = &sysctl_numa_balancing_migrate_deferred,
.maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{
Expand Down
48 changes: 47 additions & 1 deletion mm/mempolicy.c
Original file line number Diff line number Diff line change
Expand Up @@ -2301,6 +2301,35 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}

#ifdef CONFIG_NUMA_BALANCING
/*
 * numa_migrate_deferred - should this NUMA hinting fault skip its migration?
 * @p: task taking the fault
 * @last_cpupid: cpupid that was recorded on the page before this fault
 *
 * Returns true when the migration must be skipped because @p still has
 * deferral budget left over from an earlier shared fault; each skip
 * consumes one unit of that budget. Returns false for private faults
 * (which leave the budget untouched) and once the budget is used up.
 */
static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
{
	/* Never defer a private fault */
	if (cpupid_match_pid(p, last_cpupid))
		return false;

	/*
	 * The budget is a signed int that is loaded from an unsigned sysctl
	 * written through proc_dointvec, so it can end up negative. Only a
	 * strictly positive budget defers: a plain non-zero test would keep
	 * counting down from a negative value and defer shared faults
	 * practically forever.
	 */
	if (p->numa_migrate_deferred > 0) {
		p->numa_migrate_deferred--;
		return true;
	}
	return false;
}

/*
 * Arm @p's deferral counter with the current value of the
 * numa_balancing_migrate_deferred sysctl, overwriting whatever
 * budget was left from a previous shared fault.
 */
static inline void
defer_numa_migrate(struct task_struct *p)
{
	const unsigned int budget = sysctl_numa_balancing_migrate_deferred;

	p->numa_migrate_deferred = budget;
}
#else
/*
 * Stub used when CONFIG_NUMA_BALANCING is not set: no migration is
 * ever reported as deferred.
 */
static inline bool
numa_migrate_deferred(struct task_struct *p, int last_cpupid)
{
	/* No deferral counter is consulted in this configuration. */
	return false;
}

/*
 * Stub used when CONFIG_NUMA_BALANCING is not set: arming the
 * deferral counter is a no-op.
 */
static inline void
defer_numa_migrate(struct task_struct *p)
{
	/* Intentionally empty. */
}
#endif /* CONFIG_NUMA_BALANCING */

/**
* mpol_misplaced - check whether current page node is valid in policy
*
Expand Down Expand Up @@ -2402,7 +2431,24 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
* relation.
*/
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid)
if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {

/* See sysctl_numa_balancing_migrate_deferred comment */
if (!cpupid_match_pid(current, last_cpupid))
defer_numa_migrate(current);

goto out;
}

/*
* The quadratic filter above reduces extraneous migration
* of shared pages somewhat. This code reduces it even more,
* reducing the overhead of page migrations of shared pages.
* This makes workloads with shared pages rely more on
* "move task near its memory", and less on "move memory
* towards its task", which is exactly what we want.
*/
if (numa_migrate_deferred(current, last_cpupid))
goto out;
}

Expand Down

0 comments on commit de1c9ce

Please sign in to comment.