
Commit 889976d

yinghan authored and torvalds committed
memcg: reclaim memory from nodes in round-robin order
Presently, memory cgroup's direct reclaim frees memory from the current node. But this has some problems: when a set of threads works cooperatively, they tend to operate on the same node, so if they hit the memcg limit they reclaim memory from themselves, damaging their own active working set.

For example, assume a 2-node system with Node 0 and Node 1 and a memcg with a 1G limit. After some work, file cache remains and the usages are:

Node 0: 1M
Node 1: 998M

An application then running on Node 0 will cannibalize its own working set before freeing the unneeded file cache on Node 1.

This patch adds round-robin node selection for NUMA, applying equal reclaim pressure to each node. When used with cpuset's memory-spread feature, this works very well. But yes, a better algorithm is still needed.

[akpm@linux-foundation.org: comment editing]
[kamezawa.hiroyu@jp.fujitsu.com: fix time comparisons]
Signed-off-by: Ying Han <yinghan@google.com>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
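To make the round-robin idea above concrete, here is a minimal userspace C sketch of the selection step, written for this write-up only. It is not the kernel code: the struct name, the boolean array standing in for the kernel's nodemask_t scan_nodes, and the fixed MAX_NODES are invented simplifications. The real patch also rebuilds the candidate mask from per-node LRU statistics (at most once every 10 seconds) and falls back to the current node via numa_node_id(), as shown in the mm/memcontrol.c diff below.

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

/* Simplified stand-in for the per-memcg state added by this patch. */
struct memcg_sketch {
        bool node_has_pages[MAX_NODES]; /* analogue of mem->scan_nodes */
        int last_scanned_node;          /* analogue of mem->last_scanned_node */
};

/*
 * Pick the next node with reclaimable pages after the one used last time,
 * wrapping around; fall back to node 0 when no node qualifies (the kernel
 * version falls back to the current node instead).
 */
static int select_victim_node(struct memcg_sketch *memcg)
{
        int start = memcg->last_scanned_node;
        int i;

        for (i = 1; i <= MAX_NODES; i++) {
                int node = (start + i) % MAX_NODES;

                if (memcg->node_has_pages[node]) {
                        memcg->last_scanned_node = node;
                        return node;
                }
        }
        return 0;       /* nothing reclaimable anywhere: use a default node */
}

int main(void)
{
        struct memcg_sketch memcg = {
                .node_has_pages = { true, true, false, true },
                .last_scanned_node = MAX_NODES - 1, /* first pick wraps to 0 */
        };
        int hit;

        /* Successive limit hits spread reclaim over nodes 0, 1, 3, 0, ... */
        for (hit = 0; hit < 5; hit++)
                printf("reclaim starts on node %d\n", select_victim_node(&memcg));
        return 0;
}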
1 parent 4e4c941 commit 889976d

3 files changed (+106, -7)


include/linux/memcontrol.h (+1)
@@ -106,6 +106,7 @@ extern void mem_cgroup_end_migration(struct mem_cgroup *mem,
  */
 int mem_cgroup_inactive_anon_is_low(struct mem_cgroup *memcg);
 int mem_cgroup_inactive_file_is_low(struct mem_cgroup *memcg);
+int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
                                        struct zone *zone,
                                        enum lru_list lru);

mm/memcontrol.c (+96, -6)
@@ -231,6 +231,11 @@ struct mem_cgroup {
         * reclaimed from.
         */
        int last_scanned_child;
+       int last_scanned_node;
+#if MAX_NUMNODES > 1
+       nodemask_t      scan_nodes;
+       unsigned long   next_scan_node_update;
+#endif
        /*
         * Should the accounting and control be hierarchical, per subtree?
         */
@@ -624,18 +629,27 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
        preempt_enable();
 }
 
+static unsigned long
+mem_cgroup_get_zonestat_node(struct mem_cgroup *mem, int nid, enum lru_list idx)
+{
+       struct mem_cgroup_per_zone *mz;
+       u64 total = 0;
+       int zid;
+
+       for (zid = 0; zid < MAX_NR_ZONES; zid++) {
+               mz = mem_cgroup_zoneinfo(mem, nid, zid);
+               total += MEM_CGROUP_ZSTAT(mz, idx);
+       }
+       return total;
+}
 static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
                                        enum lru_list idx)
 {
-       int nid, zid;
-       struct mem_cgroup_per_zone *mz;
+       int nid;
        u64 total = 0;
 
        for_each_online_node(nid)
-               for (zid = 0; zid < MAX_NR_ZONES; zid++) {
-                       mz = mem_cgroup_zoneinfo(mem, nid, zid);
-                       total += MEM_CGROUP_ZSTAT(mz, idx);
-               }
+               total += mem_cgroup_get_zonestat_node(mem, nid, idx);
        return total;
 }
 
@@ -1418,6 +1432,81 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem)
        return ret;
 }
 
+#if MAX_NUMNODES > 1
+
+/*
+ * Always updating the nodemask is not very good - even if we have an empty
+ * list or the wrong list here, we can start from some node and traverse all
+ * nodes based on the zonelist. So update the list loosely once per 10 secs.
+ *
+ */
+static void mem_cgroup_may_update_nodemask(struct mem_cgroup *mem)
+{
+       int nid;
+
+       if (time_after(mem->next_scan_node_update, jiffies))
+               return;
+
+       mem->next_scan_node_update = jiffies + 10*HZ;
+       /* make a nodemask where this memcg uses memory from */
+       mem->scan_nodes = node_states[N_HIGH_MEMORY];
+
+       for_each_node_mask(nid, node_states[N_HIGH_MEMORY]) {
+
+               if (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_FILE) ||
+                   mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_FILE))
+                       continue;
+
+               if (total_swap_pages &&
+                   (mem_cgroup_get_zonestat_node(mem, nid, LRU_INACTIVE_ANON) ||
+                    mem_cgroup_get_zonestat_node(mem, nid, LRU_ACTIVE_ANON)))
+                       continue;
+               node_clear(nid, mem->scan_nodes);
+       }
+}
+
+/*
+ * Selecting a node where we start reclaim from. Because what we need is just
+ * reducing usage counter, start from anywhere is O,K. Considering
+ * memory reclaim from current node, there are pros. and cons.
+ *
+ * Freeing memory from current node means freeing memory from a node which
+ * we'll use or we've used. So, it may make LRU bad. And if several threads
+ * hit limits, it will see a contention on a node. But freeing from remote
+ * node means more costs for memory reclaim because of memory latency.
+ *
+ * Now, we use round-robin. Better algorithm is welcomed.
+ */
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+       int node;
+
+       mem_cgroup_may_update_nodemask(mem);
+       node = mem->last_scanned_node;
+
+       node = next_node(node, mem->scan_nodes);
+       if (node == MAX_NUMNODES)
+               node = first_node(mem->scan_nodes);
+       /*
+        * We call this when we hit limit, not when pages are added to LRU.
+        * No LRU may hold pages because all pages are UNEVICTABLE or
+        * memcg is too small and all pages are not on LRU. In that case,
+        * we use curret node.
+        */
+       if (unlikely(node == MAX_NUMNODES))
+               node = numa_node_id();
+
+       mem->last_scanned_node = node;
+       return node;
+}
+
+#else
+int mem_cgroup_select_victim_node(struct mem_cgroup *mem)
+{
+       return 0;
+}
+#endif
+
 /*
  * Scan the hierarchy if needed to reclaim memory. We remember the last child
  * we reclaimed from, so that we don't end up penalizing one child extensively
@@ -4606,6 +4695,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
                res_counter_init(&mem->memsw, NULL);
        }
        mem->last_scanned_child = 0;
+       mem->last_scanned_node = MAX_NUMNODES;
        INIT_LIST_HEAD(&mem->oom_notify);
 
        if (parent)
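The 10-second refresh in mem_cgroup_may_update_nodemask() is a simple staleness throttle: as the comment in the diff notes, starting from a slightly outdated node list is acceptable because reclaim traverses nodes via the zonelist anyway, so the mask is rebuilt only loosely. As a rough userspace analogue of that pattern (the kernel uses jiffies and time_after(); the struct, helper names, and CLOCK_MONOTONIC timing here are assumptions made for illustration):

#define _POSIX_C_SOURCE 199309L
#include <stdbool.h>
#include <time.h>

/* Userspace analogue of "refresh the cached node list at most every 10s". */
struct node_cache {
        struct timespec next_update;    /* analogue of next_scan_node_update */
        /* the cached candidate-node mask would live here */
};

static bool ts_before(const struct timespec *a, const struct timespec *b)
{
        return a->tv_sec < b->tv_sec ||
               (a->tv_sec == b->tv_sec && a->tv_nsec < b->tv_nsec);
}

/* Returns true when the caller should rebuild the cached node list. */
static bool may_update(struct node_cache *cache)
{
        struct timespec now;

        clock_gettime(CLOCK_MONOTONIC, &now);
        if (ts_before(&now, &cache->next_update))
                return false;           /* still fresh: keep the stale list */

        cache->next_update = now;
        cache->next_update.tv_sec += 10;        /* like jiffies + 10*HZ */
        return true;
}

int main(void)
{
        struct node_cache cache = { .next_update = { 0, 0 } };

        /* First call triggers an update; an immediate second call is throttled. */
        return may_update(&cache) && !may_update(&cache) ? 0 : 1;
}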

mm/vmscan.c (+9, -1)
@@ -2226,6 +2226,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
 {
        struct zonelist *zonelist;
        unsigned long nr_reclaimed;
+       int nid;
        struct scan_control sc = {
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
@@ -2242,7 +2243,14 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                .gfp_mask = sc.gfp_mask,
        };
 
-       zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+       /*
+        * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
+        * take care of from where we get pages. So the node where we start the
+        * scan does not need to be the current node.
+        */
+       nid = mem_cgroup_select_victim_node(mem_cont);
+
+       zonelist = NODE_DATA(nid)->node_zonelists;
 
        trace_mm_vmscan_memcg_reclaim_begin(0,
                                sc.may_writepage,
