Skip to content

Commit b472e2c

Browse files
committed
Hot/cold page tracking using hardware counters
Based on the work from Vaibhav Jain <vaibhav@linux.ibm.com> and Sandipan Das <sandipan@linux.ibm.com> Not-yet-Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
1 parent 830b3c6 commit b472e2c

File tree

7 files changed

+632
-6
lines changed

7 files changed

+632
-6
lines changed

arch/powerpc/Kconfig

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1025,6 +1025,15 @@ config PPC_RTAS_FILTER
10251025
Say Y unless you know what you are doing and the filter is causing
10261026
problems for you.
10271027

1028+
# Opt-in: defaults to N, matching the advice given in the help text.
config PPC_HCA_HOTNESS
	bool "PowerPC HCA engine based page hotness"
	depends on PPC_BOOK3S_64 && LRU_GEN
	help
	  Use HCA engine to find page hotness

	  If unsure, say N.
1036+
10281037
endmenu
10291038

10301039
config ISA_DMA_API

arch/powerpc/include/asm/hca.h

Lines changed: 239 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,239 @@
1+
/* SPDX-License-Identifier: GPL-2.0-or-later */
2+
3+
/*
4+
* Copyright 2021, Sandipan Das, IBM Corp.
5+
* Configuration helpers for the Hot-Cold Affinity helper
6+
*/
7+
8+
#ifndef _ASM_POWERPC_HCA_H
9+
#define _ASM_POWERPC_HCA_H
10+
11+
#include <linux/align.h>
#include <linux/bitops.h>
#include <linux/log2.h>
#include <linux/minmax.h>
#include <linux/types.h>
14+
15+
/* Binary size units (powers of 1024), all of type unsigned long. */
#define KB	(1UL << 10)
#define MB	(KB << 10)
#define GB	(MB << 10)
#define TB	(GB << 10)
19+
20+
/*
 * NOTE(review): the trailing "2" presumably records a second engine that is
 * not used yet -- confirm against the hardware documentation.
 */
#define HCA_ENGINES_PER_CHIP	1 /* 2 */

/* Size in bytes of one counter entry; entries are decoded as 64-bit values. */
#define HCA_ENTRY_SIZE		8
22+
23+
/* HCA page size: 4kB when CONFIG_PPC_4K_PAGES is set, 64kB otherwise. */
#if defined(CONFIG_PPC_4K_PAGES)
#define HCA_PAGE_SIZE	(4 * KB)
#else	/* CONFIG_PPC_64K_PAGES */
#define HCA_PAGE_SIZE	(64 * KB)
#endif	/* CONFIG_PPC_4K_PAGES */
28+
29+
/*
 * HCA_OVERFLOW_MASK() - sanitise a counter overflow mask
 * @m: The counter overflow mask
 *
 * Supported overflow masks are 16, 32, 64 ... 4096. The page stats in
 * the HCA cache are written back to memory once the count reaches @m.
 *
 * @m is clamped to [16, 4096] before it is rounded up to a power of two.
 * Both bounds are powers of two, so the rounded value stays in range, and
 * roundup_pow_of_two() is never handed 0 (for which its result is
 * undefined) or a value whose next power of two does not fit in 64 bits.
 */
#define HCA_OVERFLOW_MASK(m) \
	((u64)roundup_pow_of_two(clamp_t(u64, (m), 16, 4096)))
#define HCA_OVERFLOW_MASK_DEFAULT 4096
37+
38+
/*
 * HCA_SAMPLING_MODE() - reduce @m to a valid command sampling mode
 * @m: The command sampling mode
 *
 * Only the two low-order bits of @m are kept (so e.g. 4 maps to 0, not 3),
 * which always yields one of the supported command sampling modes:
 *	0 -> No sampling (capture all commands)
 *	1 -> Sample 1 of 16 commands
 *	2 -> Sample 1 of 32 commands
 *	3 -> Dynamic sampling (configured separately)
 *
 * The HCA fabric update traffic is reduced at the cost of accuracy. The
 * counts are scaled based on the sampling rate, i.e. if a single command
 * is seen when 1 of 16 mode is used, the corresponding page count will be
 * incremented by 16.
 */
#define HCA_SAMPLING_MODE(m)		((u64)(m) & 0x3)
#define HCA_SAMPLING_MODE_DEFAULT	3
54+
55+
/*
 * HCA_SAMPLING_PERIOD() - sanitise a command sampling period
 * @p: The command sampling period (in cycles)
 *
 * Supported command sampling periods are 256, 512, 1024 ... 65536 cycles.
 * HCA update commands sent to the fabric are counted every @p cycles.
 *
 * Only used when dynamic sampling is enabled.
 * Actual period is = (value+1) * 256
 *
 * @p is clamped to [256, 65536] before it is rounded up to a power of two,
 * so roundup_pow_of_two() never sees 0 (undefined result) or a value whose
 * next power of two does not fit in 64 bits.
 *
 * NOTE(review): HCA_SAMPLING_PERIOD_DEFAULT is 0, which this macro never
 * returns; presumably it is the raw "value" of the formula above rather
 * than a period in cycles -- confirm.
 */
#define HCA_SAMPLING_PERIOD(p) \
	((u64)roundup_pow_of_two(clamp_t(u64, (p), 256, 65536)))
#define HCA_SAMPLING_PERIOD_DEFAULT 0
67+
68+
/*
 * HCA_SAMPLING_LOWER_THRESH() - cap the lower command threshold at 255
 * @t: The command threshold
 *
 * Supported command thresholds are 0, 1, 2 ... 255 commands.
 *
 * With the lower command threshold, the sampling rate will increase when
 * fewer than @t number of update commands are detected within a sampling
 * period. (The upper command threshold, HCA_SAMPLING_UPPER_THRESH(),
 * reduces the sampling rate when more than its @t commands are detected.)
 *
 * Only used when dynamic sampling is enabled.
 */
#define HCA_SAMPLING_LOWER_THRESH(t)		min_t(u64, (t), 255)
#define HCA_SAMPLING_LOWER_THRESH_DEFAULT	64UL
85+
86+
/*
 * HCA_SAMPLING_UPPER_THRESH() - cap the upper command threshold at 255
 * @t: The command threshold
 *
 * Supported command thresholds are 0, 1, 2 ... 255 commands.
 *
 * With the upper command threshold, the sampling rate will reduce when
 * more than @t number of update commands are detected within a sampling
 * period. (The lower command threshold, HCA_SAMPLING_LOWER_THRESH(),
 * increases the sampling rate when fewer than its @t commands are
 * detected.)
 *
 * Only used when dynamic sampling is enabled.
 */
#define HCA_SAMPLING_UPPER_THRESH(t)		min_t(u64, (t), 255)
#define HCA_SAMPLING_UPPER_THRESH_DEFAULT	255UL
103+
104+
/*
 * HCA_MONITOR_SIZE() - sanitise a monitor region size
 * @s: The monitor region size (in bytes)
 *
 * Supported monitor region sizes are 16GB, 32GB, 64GB ... 512TB. The
 * minimum and maximum region sizes are always guaranteed to be 16GB
 * and 512TB respectively if the specified value is out of bounds.
 *
 * @s is clamped to [16GB, 512TB] before it is rounded up to a power of
 * two, so a size of 0 (for which roundup_pow_of_two() is undefined) or a
 * size whose next power of two does not fit in 64 bits is handled too.
 */
#define HCA_MONITOR_SIZE(s) \
	((u64)roundup_pow_of_two(clamp_t(u64, (s), 16 * GB, 512 * TB)))
113+
114+
/*
 * HCA_MONITOR_BASE() - align a monitor region base to the region size
 * @b: The monitor region base
 * @s: The monitor region size (in bytes)
 *
 * The monitor region base address must be aligned to its size, so @b is
 * passed through ALIGN() with the sanitised size HCA_MONITOR_SIZE(@s) as
 * the alignment.
 */
#define HCA_MONITOR_BASE(b, s) \
	ALIGN((u64)(b), HCA_MONITOR_SIZE(s))
122+
123+
/*
 * HCA_COUNTER_SIZE() - size in bytes of the counter region
 * @s: The monitor region size
 *
 * The counter region size is directly derived from the monitor region
 * size and the page size: one HCA_ENTRY_SIZE byte entry is needed for
 * every HCA_PAGE_SIZE sized page of the (sanitised) monitor region.
 */
#define HCA_COUNTER_SIZE(s) \
	((HCA_MONITOR_SIZE(s) * (u64)HCA_ENTRY_SIZE) / HCA_PAGE_SIZE)
#define HCA_COUNTER_SIZE_DEFAULT 0
#define HCA_COUNTER_BASE_DEFAULT 0
132+
133+
/*
 * HCA_DECAY_DELAY() - sanitise a decay delay
 * @d: The decay delay (in ns)
 *
 * Decay delay defines the interval between updates to HCA cachelines of
 * 128 bytes. This parameter is not indicative of the absolute time taken
 * to apply decay updates to the entire counter region. However, that can
 * be derived from the configured decay delay.
 *
 * E.g. monitoring a 512GB region of 64kB pages requires a 64MB counter
 * region. To apply one round of decay updates to the entire region will
 * require (64M / 128) = 524288 HCA cache lines to be updated. If @d is
 * 2048, (524288 * 2048) ns = ~1.07s is required to update the entire
 * counter region.
 *
 * If the delay is set to 0, the decay feature is disabled. Otherwise,
 * supported decay delay periods are 16ns, 32ns, 64ns ... 147573952589s.
 * Since @d is in the nanosecond scale, representing the upper bound is
 * not possible with a 64-bit integer. Moreover, such large delays are
 * impractical for most intents and purposes. So, while the hardware can
 * support it, the maximum configurable decay delay is restricted to
 * 9223372036854775808ns. The minimum and maximum decay delays are always
 * guaranteed to be 16ns and 9223372036854775808ns respectively if the
 * specified value is out of bounds.
 *
 * A non-zero @d is clamped to that range before it is rounded up to a
 * power of two, so values above 2^63 cannot overflow the rounding. @d is
 * evaluated more than once; do not pass an expression with side effects.
 */
#define HCA_DECAY_DELAY(d) \
	((d) ? (uint64_t)roundup_pow_of_two(clamp_t(uint64_t, (d), 16, \
						      9223372036854775808ULL)) \
	     : (uint64_t)0)
/*
 * NOTE(review): this default was annotated "1 msec", but HCA_DECAY_DELAY()
 * documents @d in nanoseconds -- confirm the intended unit.
 */
#define HCA_DECAY_DELAY_DEFAULT 1
160+
161+
/*
 * Entry constants and helpers
 *
 * HCA_ENTRY_SIZE (8 bytes per entry) is already defined earlier in this
 * header and is not repeated here.
 */

/*
 * HCA_ENTRY_FIELD() - extract a bitfield from a raw 64-bit entry
 * @v: The raw value of the entry
 * @s: The start of the bitfield
 * @n: The length of the bitfield
 *
 * Bits are numbered from the most significant end, i.e. bit 0 is the MSB
 * of @v. The field covers bits @s ... @s + @n - 1 and is returned
 * right-justified. Every argument is parenthesised so that expressions
 * such as "1 << 2" can be passed safely; @n is evaluated twice.
 */
#define HCA_ENTRY_FIELD(v, s, n) \
	(((v) >> (64 - ((s) + (n)))) & ((1UL << (n)) - 1))
170+
171+
/*
 * The HCA count is stored as an exponent/mantissa pair in the top 16 bits
 * of the entry. Its value is : 4^e * m
 * e = X[0:3], m = X[4:15]
 */
#define HCA_ENTRY_COUNT_EXP(e)	HCA_ENTRY_FIELD((e), 0, 4)
#define HCA_ENTRY_COUNT_MNT(e)	HCA_ENTRY_FIELD((e), 4, 12)
/* 4^e == 1 << (2 * e), so the mantissa is simply shifted left by 2 * e. */
#define HCA_ENTRY_COUNT(e) \
	(HCA_ENTRY_COUNT_MNT(e) << (2 * HCA_ENTRY_COUNT_EXP(e)))
178+
179+
/* Age field of the entry: 3 bits, X[16:18]. */
#define HCA_ENTRY_AGE(e)	HCA_ENTRY_FIELD((e), 16, 3)

/* Gen field of the entry: a single bit, X[19]. */
#define HCA_ENTRY_GEN(e)	HCA_ENTRY_FIELD((e), 19, 1)
182+
183+
/*
 * The value of the HCA prev_count is : 4^e * m
 *
 * Relative to the start of the prev_count field, e = X[0:3], m = X[4:11];
 * within the whole entry that is e = X[20:23], m = X[24:31].
 *
 * The accessors are expressed in terms of the *_START / *_LENGTH constants
 * so that the layout is described in exactly one place.
 */
#define HCA_ENTRY_PREV_COUNT_EXP_LENGTH	4
#define HCA_ENTRY_PREV_COUNT_EXP_START	20
#define HCA_ENTRY_PREV_COUNT_MNT_LENGTH	8
#define HCA_ENTRY_PREV_COUNT_MNT_START	24
#define HCA_ENTRY_PREV_COUNT_EXP(e) \
	HCA_ENTRY_FIELD((e), HCA_ENTRY_PREV_COUNT_EXP_START, \
			HCA_ENTRY_PREV_COUNT_EXP_LENGTH)
#define HCA_ENTRY_PREV_COUNT_MNT(e) \
	HCA_ENTRY_FIELD((e), HCA_ENTRY_PREV_COUNT_MNT_START, \
			HCA_ENTRY_PREV_COUNT_MNT_LENGTH)
#define HCA_ENTRY_PREV_COUNT(e) \
	((1UL << (2 * HCA_ENTRY_PREV_COUNT_EXP(e))) * HCA_ENTRY_PREV_COUNT_MNT(e))
194+
195+
/*
 * Timelog field of the entry: HCA_ENTRY_TIMELOG_LENGTH bits starting at
 * bit HCA_ENTRY_TIMELOG_START. The accessor uses the named constants so
 * the layout is described in exactly one place.
 */
#define HCA_ENTRY_TIMELOG_LENGTH	7
#define HCA_ENTRY_TIMELOG_START		32
#define HCA_ENTRY_TIMELOG(e) \
	HCA_ENTRY_FIELD((e), HCA_ENTRY_TIMELOG_START, HCA_ENTRY_TIMELOG_LENGTH)
198+
199+
/*
 * Socket id slots: HCA_ENTRY_SOCKETID_COUNT fields of
 * HCA_ENTRY_SOCKETID_LENGTH bits each, packed back to back from bit 39
 * to the end of the 64-bit entry. @s selects the slot (0-based) and is
 * not range checked.
 */
#define HCA_ENTRY_SOCKETID_COUNT	5
#define HCA_ENTRY_SOCKETID_LENGTH	5
#define HCA_ENTRY_SOCKETID_START(s)	(39 + HCA_ENTRY_SOCKETID_LENGTH * (s))
#define HCA_ENTRY_SOCKETID(e, s) \
	HCA_ENTRY_FIELD((e), HCA_ENTRY_SOCKETID_START(s), \
			HCA_ENTRY_SOCKETID_LENGTH)
203+
204+
struct hca_entry {
205+
unsigned long count;
206+
uint8_t age;
207+
uint8_t gen;
208+
unsigned long prev_count;
209+
uint8_t timelog;
210+
uint8_t socketid[HCA_ENTRY_SOCKETID_COUNT];
211+
};
212+
213+
static inline unsigned long hotness_score(struct hca_entry * entry)
214+
{
215+
unsigned long hotness;
216+
217+
/*
218+
* Give more weightage to the prev_count because it got
219+
* historical values. Take smaller part of count as we
220+
* age more because prev_count would be a better approximation.
221+
* We still need to consider count to accomidate spike in access.
222+
* + 1 with age to handle age == 0.
223+
*/
224+
hotness = entry->prev_count + (entry->count / (entry->age + 1));
225+
226+
return hotness;
227+
}
228+
229+
/*
 * Forward declarations: only pointers to these types are used below, so
 * the header does not depend on the including file having pulled in
 * their full definitions first.
 */
struct dentry;
struct folio;
struct lruvec;

/*
 * Switches and function-pointer hooks exported by the HCA code.
 * NOTE(review): the hooks are presumably installed by a platform backend
 * and may be unset until then -- confirm before calling them unchecked.
 */
extern bool hca_lru_age;
extern bool hca_lru_evict;
extern int (*hca_pfn_entry)(unsigned long pfn, struct hca_entry *entry);
extern bool (*hca_node_enabled)(int numa_node);
extern void (*hca_backend_node_debugfs_init)(int numa_node, struct dentry *node_dentry);
extern void (*hca_backend_debugfs_init)(struct dentry *root_dentry);
extern int (*hca_clear_entry)(unsigned long pfn);

/* Entry points implemented outside this header. */
int map_hca_lru_seq(struct lruvec *lruvec, struct folio *folio);
bool hca_try_to_inc_max_seq(struct lruvec *lruvec, unsigned long nr_to_scan, unsigned long max_seq);
void restablish_hotness_range(int node);
239+
#endif /* _ASM_POWERPC_HCA_H */

arch/powerpc/include/asm/page.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -319,6 +319,11 @@ void arch_free_page(struct page *page, int order);
319319
#define HAVE_ARCH_FREE_PAGE
320320
#endif
321321

322+
/*
 * With CONFIG_PPC_HCA_HOTNESS the architecture declares its own
 * arch_alloc_page() and advertises it through HAVE_ARCH_ALLOC_PAGE.
 */
#if defined(CONFIG_PPC_HCA_HOTNESS)
#define HAVE_ARCH_ALLOC_PAGE
void arch_alloc_page(struct page *page, int order);
#endif /* CONFIG_PPC_HCA_HOTNESS */
326+
322327
struct vm_area_struct;
323328

324329
extern unsigned long kernstart_virt_addr;

arch/powerpc/mm/Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,3 +19,4 @@ obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o
1919
obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o
2020
obj-$(CONFIG_PTDUMP_CORE) += ptdump/
2121
obj-$(CONFIG_KASAN) += kasan/
22+
# HCA based page hotness tracking, built only with CONFIG_PPC_HCA_HOTNESS
obj-$(CONFIG_PPC_HCA_HOTNESS) += hca.o

0 commit comments

Comments
 (0)