Skip to content

Commit 7e1c0d6

Browse files
shakeelb authored and torvalds committed
memcg: switch lruvec stats to rstat
The commit 2d146aa ("mm: memcontrol: switch to rstat") switched memcg stats to the rstat infrastructure but skipped the conversion of the lruvec stats, as such stats are read in performance-critical code paths and flushing stats may have impacted the performance of applications. This patch converts the lruvec stats to rstat, and later patches add mechanisms to keep the performance impact to a minimum. The rstat conversion comes with a price, i.e. memory cost. Effectively, this patch reverts the savings done by the commit f3344ad ("mm: memcontrol: optimize per-lruvec stats counter memory usage"). However, this cost is justified due to the negative impact of inaccurate lruvec stats on many heuristics. One such case is reported in [1]. The memory reclaim code is filled with a plethora of heuristics, and many of those heuristics read the lruvec stats. So, inaccurate stats can make such heuristics ineffective. [1] reports the impact of inaccurate lruvec stats on the "cache trim mode" heuristic. Inaccurate lruvec stats can impact the deactivation and aging anon heuristics as well. [1] https://lore.kernel.org/linux-mm/20210311004449.1170308-1-ying.huang@intel.com/ Link: https://lkml.kernel.org/r/20210716212137.1391164-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210714013948.270662-1-shakeelb@google.com Signed-off-by: Shakeel Butt <shakeelb@google.com> Cc: Tejun Heo <tj@kernel.org> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Muchun Song <songmuchun@bytedance.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Roman Gushchin <guro@fb.com> Cc: Huang Ying <ying.huang@intel.com> Cc: Hillf Danton <hdanton@sina.com> Cc: Michal Koutný <mkoutny@suse.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
1 parent fab827d commit 7e1c0d6

File tree

2 files changed

+58
-98
lines changed

2 files changed

+58
-98
lines changed

include/linux/memcontrol.h

Lines changed: 20 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -105,14 +105,6 @@ struct mem_cgroup_reclaim_iter {
105105
unsigned int generation;
106106
};
107107

108-
struct lruvec_stat {
109-
long count[NR_VM_NODE_STAT_ITEMS];
110-
};
111-
112-
struct batched_lruvec_stat {
113-
s32 count[NR_VM_NODE_STAT_ITEMS];
114-
};
115-
116108
/*
117109
* Bitmap and deferred work of shrinker::id corresponding to memcg-aware
118110
* shrinkers, which have elements charged to this memcg.
@@ -123,24 +115,30 @@ struct shrinker_info {
123115
unsigned long *map;
124116
};
125117

118+
struct lruvec_stats_percpu {
119+
/* Local (CPU and cgroup) state */
120+
long state[NR_VM_NODE_STAT_ITEMS];
121+
122+
/* Delta calculation for lockless upward propagation */
123+
long state_prev[NR_VM_NODE_STAT_ITEMS];
124+
};
125+
126+
struct lruvec_stats {
127+
/* Aggregated (CPU and subtree) state */
128+
long state[NR_VM_NODE_STAT_ITEMS];
129+
130+
/* Pending child counts during tree propagation */
131+
long state_pending[NR_VM_NODE_STAT_ITEMS];
132+
};
133+
126134
/*
127135
* per-node information in memory controller.
128136
*/
129137
struct mem_cgroup_per_node {
130138
struct lruvec lruvec;
131139

132-
/*
133-
* Legacy local VM stats. This should be struct lruvec_stat and
134-
* cannot be optimized to struct batched_lruvec_stat. Because
135-
* the threshold of the lruvec_stat_cpu can be as big as
136-
* MEMCG_CHARGE_BATCH * PAGE_SIZE. It can fit into s32. But this
137-
* filed has no upper limit.
138-
*/
139-
struct lruvec_stat __percpu *lruvec_stat_local;
140-
141-
/* Subtree VM stats (batched updates) */
142-
struct batched_lruvec_stat __percpu *lruvec_stat_cpu;
143-
atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS];
140+
struct lruvec_stats_percpu __percpu *lruvec_stats_percpu;
141+
struct lruvec_stats lruvec_stats;
144142

145143
unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS];
146144

@@ -997,7 +995,7 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec,
997995
return node_page_state(lruvec_pgdat(lruvec), idx);
998996

999997
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1000-
x = atomic_long_read(&pn->lruvec_stat[idx]);
998+
x = READ_ONCE(pn->lruvec_stats.state[idx]);
1001999
#ifdef CONFIG_SMP
10021000
if (x < 0)
10031001
x = 0;
@@ -1017,7 +1015,7 @@ static inline unsigned long lruvec_page_state_local(struct lruvec *lruvec,
10171015

10181016
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
10191017
for_each_possible_cpu(cpu)
1020-
x += per_cpu(pn->lruvec_stat_local->count[idx], cpu);
1018+
x += per_cpu(pn->lruvec_stats_percpu->state[idx], cpu);
10211019
#ifdef CONFIG_SMP
10221020
if (x < 0)
10231021
x = 0;

mm/memcontrol.c

Lines changed: 38 additions & 76 deletions
Original file line numberDiff line numberDiff line change
@@ -660,23 +660,11 @@ static unsigned long memcg_page_state_local(struct mem_cgroup *memcg, int idx)
660660
return x;
661661
}
662662

663-
static struct mem_cgroup_per_node *
664-
parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
665-
{
666-
struct mem_cgroup *parent;
667-
668-
parent = parent_mem_cgroup(pn->memcg);
669-
if (!parent)
670-
return NULL;
671-
return parent->nodeinfo[nid];
672-
}
673-
674663
void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
675664
int val)
676665
{
677666
struct mem_cgroup_per_node *pn;
678667
struct mem_cgroup *memcg;
679-
long x, threshold = MEMCG_CHARGE_BATCH;
680668

681669
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
682670
memcg = pn->memcg;
@@ -685,21 +673,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
685673
__mod_memcg_state(memcg, idx, val);
686674

687675
/* Update lruvec */
688-
__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
689-
690-
if (vmstat_item_in_bytes(idx))
691-
threshold <<= PAGE_SHIFT;
692-
693-
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
694-
if (unlikely(abs(x) > threshold)) {
695-
pg_data_t *pgdat = lruvec_pgdat(lruvec);
696-
struct mem_cgroup_per_node *pi;
697-
698-
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
699-
atomic_long_add(x, &pi->lruvec_stat[idx]);
700-
x = 0;
701-
}
702-
__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
676+
__this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
703677
}
704678

705679
/**
@@ -2278,40 +2252,13 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
22782252
mutex_unlock(&percpu_charge_mutex);
22792253
}
22802254

2281-
static void memcg_flush_lruvec_page_state(struct mem_cgroup *memcg, int cpu)
2282-
{
2283-
int nid;
2284-
2285-
for_each_node(nid) {
2286-
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
2287-
unsigned long stat[NR_VM_NODE_STAT_ITEMS];
2288-
struct batched_lruvec_stat *lstatc;
2289-
int i;
2290-
2291-
lstatc = per_cpu_ptr(pn->lruvec_stat_cpu, cpu);
2292-
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
2293-
stat[i] = lstatc->count[i];
2294-
lstatc->count[i] = 0;
2295-
}
2296-
2297-
do {
2298-
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
2299-
atomic_long_add(stat[i], &pn->lruvec_stat[i]);
2300-
} while ((pn = parent_nodeinfo(pn, nid)));
2301-
}
2302-
}
2303-
23042255
static int memcg_hotplug_cpu_dead(unsigned int cpu)
23052256
{
23062257
struct memcg_stock_pcp *stock;
2307-
struct mem_cgroup *memcg;
23082258

23092259
stock = &per_cpu(memcg_stock, cpu);
23102260
drain_stock(stock);
23112261

2312-
for_each_mem_cgroup(memcg)
2313-
memcg_flush_lruvec_page_state(memcg, cpu);
2314-
23152262
return 0;
23162263
}
23172264

@@ -5118,17 +5065,9 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
51185065
if (!pn)
51195066
return 1;
51205067

5121-
pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
5122-
GFP_KERNEL_ACCOUNT);
5123-
if (!pn->lruvec_stat_local) {
5124-
kfree(pn);
5125-
return 1;
5126-
}
5127-
5128-
pn->lruvec_stat_cpu = alloc_percpu_gfp(struct batched_lruvec_stat,
5129-
GFP_KERNEL_ACCOUNT);
5130-
if (!pn->lruvec_stat_cpu) {
5131-
free_percpu(pn->lruvec_stat_local);
5068+
pn->lruvec_stats_percpu = alloc_percpu_gfp(struct lruvec_stats_percpu,
5069+
GFP_KERNEL_ACCOUNT);
5070+
if (!pn->lruvec_stats_percpu) {
51325071
kfree(pn);
51335072
return 1;
51345073
}
@@ -5149,8 +5088,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
51495088
if (!pn)
51505089
return;
51515090

5152-
free_percpu(pn->lruvec_stat_cpu);
5153-
free_percpu(pn->lruvec_stat_local);
5091+
free_percpu(pn->lruvec_stats_percpu);
51545092
kfree(pn);
51555093
}
51565094

@@ -5166,15 +5104,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
51665104

51675105
static void mem_cgroup_free(struct mem_cgroup *memcg)
51685106
{
5169-
int cpu;
5170-
51715107
memcg_wb_domain_exit(memcg);
5172-
/*
5173-
* Flush percpu lruvec stats to guarantee the value
5174-
* correctness on parent's and all ancestor levels.
5175-
*/
5176-
for_each_online_cpu(cpu)
5177-
memcg_flush_lruvec_page_state(memcg, cpu);
51785108
__mem_cgroup_free(memcg);
51795109
}
51805110

@@ -5407,7 +5337,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
54075337
struct mem_cgroup *parent = parent_mem_cgroup(memcg);
54085338
struct memcg_vmstats_percpu *statc;
54095339
long delta, v;
5410-
int i;
5340+
int i, nid;
54115341

54125342
statc = per_cpu_ptr(memcg->vmstats_percpu, cpu);
54135343

@@ -5455,6 +5385,36 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
54555385
if (parent)
54565386
parent->vmstats.events_pending[i] += delta;
54575387
}
5388+
5389+
for_each_node_state(nid, N_MEMORY) {
5390+
struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid];
5391+
struct mem_cgroup_per_node *ppn = NULL;
5392+
struct lruvec_stats_percpu *lstatc;
5393+
5394+
if (parent)
5395+
ppn = parent->nodeinfo[nid];
5396+
5397+
lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu);
5398+
5399+
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
5400+
delta = pn->lruvec_stats.state_pending[i];
5401+
if (delta)
5402+
pn->lruvec_stats.state_pending[i] = 0;
5403+
5404+
v = READ_ONCE(lstatc->state[i]);
5405+
if (v != lstatc->state_prev[i]) {
5406+
delta += v - lstatc->state_prev[i];
5407+
lstatc->state_prev[i] = v;
5408+
}
5409+
5410+
if (!delta)
5411+
continue;
5412+
5413+
pn->lruvec_stats.state[i] += delta;
5414+
if (ppn)
5415+
ppn->lruvec_stats.state_pending[i] += delta;
5416+
}
5417+
}
54585418
}
54595419

54605420
#ifdef CONFIG_MMU
@@ -6388,6 +6348,8 @@ static int memory_numa_stat_show(struct seq_file *m, void *v)
63886348
int i;
63896349
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
63906350

6351+
cgroup_rstat_flush(memcg->css.cgroup);
6352+
63916353
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
63926354
int nid;
63936355

0 commit comments

Comments
 (0)