Commit 97b2782
writeback, memcg: Implement foreign dirty flushing
There's an inherent mismatch between memcg and writeback. The former
trackes ownership per-page while the latter per-inode. This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.
Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases. For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode. B owns the inode and
A's memory is limited far below B's. A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback. A will be slowed down without
a way to make writeback of the dirty pages happen.
This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages. Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.
A reproducer follows.
write-range.c::
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
#include <sys/types.h>
static const char *usage = "write-range FILE START SIZE\n";
int main(int argc, char **argv)
{
int fd;
unsigned long start, size, end, pos;
char *endp;
char buf[4096];
if (argc < 4) {
fprintf(stderr, usage);
return 1;
}
fd = open(argv[1], O_WRONLY);
if (fd < 0) {
perror("open");
return 1;
}
start = strtoul(argv[2], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
size = strtoul(argv[3], &endp, 0);
if (*endp != '\0') {
fprintf(stderr, usage);
return 1;
}
end = start + size;
while (1) {
for (pos = start; pos < end; ) {
long bread, bwritten = 0;
if (lseek(fd, pos, SEEK_SET) < 0) {
perror("lseek");
return 1;
}
bread = read(0, buf, sizeof(buf) < end - pos ?
sizeof(buf) : end - pos);
if (bread < 0) {
perror("read");
return 1;
}
if (bread == 0)
return 0;
while (bwritten < bread) {
long this;
this = write(fd, buf + bwritten,
bread - bwritten);
if (this < 0) {
perror("write");
return 1;
}
bwritten += this;
pos += bwritten;
}
}
}
}
repro.sh::
#!/bin/bash
set -e
set -x
sysctl -w vm.dirty_expire_centisecs=300000
sysctl -w vm.dirty_writeback_centisecs=300000
sysctl -w vm.dirtytime_expire_seconds=300000
echo 3 > /proc/sys/vm/drop_caches
TEST=/sys/fs/cgroup/test
A=$TEST/A
B=$TEST/B
mkdir -p $A $B
echo "+memory +io" > $TEST/cgroup.subtree_control
echo $((1<<30)) > $A/memory.high
echo $((32<<30)) > $B/memory.high
rm -f testfile
touch testfile
fallocate -l 4G testfile
echo "Starting B"
(echo $BASHPID > $B/cgroup.procs
pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &
echo "Waiting 10s to ensure B claims the testfile inode"
sleep 5
sync
sleep 5
sync
echo "Starting A"
(echo $BASHPID > $A/cgroup.procs
pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))
v2: Added comments explaining why the specific intervals are being used.
v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
flushing while avoding possible livelocks.
v4: Use get_jiffies_64() and time_before/after64() instead of raw
jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>1 parent d62241c commit 97b2782
File tree
4 files changed
+178
-0
lines changed- include/linux
- mm
4 files changed
+178
-0
lines changed| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
63 | 63 | | |
64 | 64 | | |
65 | 65 | | |
| 66 | + | |
66 | 67 | | |
67 | 68 | | |
68 | 69 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
183 | 183 | | |
184 | 184 | | |
185 | 185 | | |
| 186 | + | |
| 187 | + | |
| 188 | + | |
| 189 | + | |
| 190 | + | |
| 191 | + | |
| 192 | + | |
| 193 | + | |
| 194 | + | |
| 195 | + | |
| 196 | + | |
| 197 | + | |
| 198 | + | |
| 199 | + | |
| 200 | + | |
| 201 | + | |
| 202 | + | |
186 | 203 | | |
187 | 204 | | |
188 | 205 | | |
| |||
307 | 324 | | |
308 | 325 | | |
309 | 326 | | |
| 327 | + | |
310 | 328 | | |
311 | 329 | | |
312 | 330 | | |
| |||
1218 | 1236 | | |
1219 | 1237 | | |
1220 | 1238 | | |
| 1239 | + | |
| 1240 | + | |
| 1241 | + | |
| 1242 | + | |
| 1243 | + | |
| 1244 | + | |
| 1245 | + | |
| 1246 | + | |
| 1247 | + | |
| 1248 | + | |
| 1249 | + | |
| 1250 | + | |
1221 | 1251 | | |
1222 | 1252 | | |
1223 | 1253 | | |
| |||
1233 | 1263 | | |
1234 | 1264 | | |
1235 | 1265 | | |
| 1266 | + | |
| 1267 | + | |
| 1268 | + | |
| 1269 | + | |
| 1270 | + | |
| 1271 | + | |
| 1272 | + | |
| 1273 | + | |
| 1274 | + | |
1236 | 1275 | | |
1237 | 1276 | | |
1238 | 1277 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
87 | 87 | | |
88 | 88 | | |
89 | 89 | | |
| 90 | + | |
| 91 | + | |
| 92 | + | |
| 93 | + | |
90 | 94 | | |
91 | 95 | | |
92 | 96 | | |
| |||
4145 | 4149 | | |
4146 | 4150 | | |
4147 | 4151 | | |
| 4152 | + | |
| 4153 | + | |
| 4154 | + | |
| 4155 | + | |
| 4156 | + | |
| 4157 | + | |
| 4158 | + | |
| 4159 | + | |
| 4160 | + | |
| 4161 | + | |
| 4162 | + | |
| 4163 | + | |
| 4164 | + | |
| 4165 | + | |
| 4166 | + | |
| 4167 | + | |
| 4168 | + | |
| 4169 | + | |
| 4170 | + | |
| 4171 | + | |
| 4172 | + | |
| 4173 | + | |
| 4174 | + | |
| 4175 | + | |
| 4176 | + | |
| 4177 | + | |
| 4178 | + | |
| 4179 | + | |
| 4180 | + | |
| 4181 | + | |
| 4182 | + | |
| 4183 | + | |
| 4184 | + | |
| 4185 | + | |
| 4186 | + | |
| 4187 | + | |
| 4188 | + | |
| 4189 | + | |
| 4190 | + | |
| 4191 | + | |
| 4192 | + | |
| 4193 | + | |
| 4194 | + | |
| 4195 | + | |
| 4196 | + | |
| 4197 | + | |
| 4198 | + | |
| 4199 | + | |
| 4200 | + | |
| 4201 | + | |
| 4202 | + | |
| 4203 | + | |
| 4204 | + | |
| 4205 | + | |
| 4206 | + | |
| 4207 | + | |
| 4208 | + | |
| 4209 | + | |
| 4210 | + | |
| 4211 | + | |
| 4212 | + | |
| 4213 | + | |
| 4214 | + | |
| 4215 | + | |
| 4216 | + | |
| 4217 | + | |
| 4218 | + | |
| 4219 | + | |
| 4220 | + | |
| 4221 | + | |
| 4222 | + | |
| 4223 | + | |
| 4224 | + | |
| 4225 | + | |
| 4226 | + | |
| 4227 | + | |
| 4228 | + | |
| 4229 | + | |
| 4230 | + | |
| 4231 | + | |
| 4232 | + | |
| 4233 | + | |
| 4234 | + | |
| 4235 | + | |
| 4236 | + | |
| 4237 | + | |
| 4238 | + | |
| 4239 | + | |
| 4240 | + | |
| 4241 | + | |
| 4242 | + | |
| 4243 | + | |
| 4244 | + | |
| 4245 | + | |
| 4246 | + | |
| 4247 | + | |
| 4248 | + | |
| 4249 | + | |
| 4250 | + | |
| 4251 | + | |
| 4252 | + | |
| 4253 | + | |
| 4254 | + | |
| 4255 | + | |
| 4256 | + | |
| 4257 | + | |
| 4258 | + | |
| 4259 | + | |
| 4260 | + | |
| 4261 | + | |
| 4262 | + | |
| 4263 | + | |
| 4264 | + | |
| 4265 | + | |
| 4266 | + | |
| 4267 | + | |
| 4268 | + | |
| 4269 | + | |
| 4270 | + | |
| 4271 | + | |
| 4272 | + | |
4148 | 4273 | | |
4149 | 4274 | | |
4150 | 4275 | | |
| |||
4661 | 4786 | | |
4662 | 4787 | | |
4663 | 4788 | | |
| 4789 | + | |
4664 | 4790 | | |
4665 | 4791 | | |
4666 | 4792 | | |
| |||
4704 | 4830 | | |
4705 | 4831 | | |
4706 | 4832 | | |
| 4833 | + | |
| 4834 | + | |
| 4835 | + | |
4707 | 4836 | | |
4708 | 4837 | | |
4709 | 4838 | | |
| |||
4833 | 4962 | | |
4834 | 4963 | | |
4835 | 4964 | | |
| 4965 | + | |
4836 | 4966 | | |
| 4967 | + | |
| 4968 | + | |
| 4969 | + | |
| 4970 | + | |
4837 | 4971 | | |
4838 | 4972 | | |
4839 | 4973 | | |
| |||
| Original file line number | Diff line number | Diff line change | |
|---|---|---|---|
| |||
1667 | 1667 | | |
1668 | 1668 | | |
1669 | 1669 | | |
| 1670 | + | |
| 1671 | + | |
1670 | 1672 | | |
1671 | 1673 | | |
1672 | 1674 | | |
| |||
2427 | 2429 | | |
2428 | 2430 | | |
2429 | 2431 | | |
| 2432 | + | |
| 2433 | + | |
2430 | 2434 | | |
2431 | 2435 | | |
2432 | 2436 | | |
| |||
0 commit comments