
Commit a6fde7a

Barry Song authored and akpm00 committed
mm: use per_vma lock for MADV_DONTNEED
Certain madvise operations, especially MADV_DONTNEED, occur far more
frequently than other madvise options, particularly in native and Java
heaps for dynamic memory management.

Currently, the mmap_lock is always held during these operations, even when
unnecessary.  This causes lock contention and can lead to severe priority
inversion, where low-priority threads—such as Android's HeapTaskDaemon—
hold the lock and block higher-priority threads.

This patch enables the use of per-VMA locks when the advised range lies
entirely within a single VMA, avoiding the need for full VMA traversal.
In practice, userspace heaps rarely issue MADV_DONTNEED across multiple
VMAs.

Tangquan's testing shows that over 99.5% of memory reclaimed by Android
benefits from this per-VMA lock optimization.  After extended runtime,
217,735 madvise calls from HeapTaskDaemon used the per-VMA path, while
only 1,231 fell back to mmap_lock.

To simplify handling, the implementation falls back to the standard
mmap_lock if userfaultfd is enabled on the VMA, avoiding the complexity of
userfaultfd_remove().

Many thanks to Lorenzo's work[1] on "mm/madvise: support VMA read locks
for MADV_DONTNEED[_LOCKED]":

"Then use this mechanism to permit VMA locking to be done later in the
madvise() logic and also to allow altering of the locking mode to permit
falling back to an mmap read lock if required."

One important point, as pointed out by Jann[2], is that
untagged_addr_remote() requires holding mmap_lock.  This is because
address tagging on x86 and RISC-V is quite complex.

Until untagged_addr_remote() becomes atomic—which seems unlikely in the
near future—we cannot support per-VMA locks for remote processes.  So for
now, only local processes are supported.

Lance said:

: Just to put some numbers on it, I ran a micro-benchmark with 100
: parallel threads, where each thread calls madvise() on its own 1GiB
: chunk of 64KiB mTHP-backed memory.  The performance gain is huge:
:
: 1) MADV_DONTNEED saw its average time drop from 0.0508s to 0.0270s
:    (~47% faster)
:
: 2) MADV_FREE saw its average time drop from 0.3078s to 0.1095s (~64%
:    faster)

[lorenzo.stoakes@oracle.com: avoid any chance of uninitialised pointer deref]
Link: https://lkml.kernel.org/r/309d22ca-6cd9-4601-8402-d441a07d9443@lucifer.local
Link: https://lore.kernel.org/all/0b96ce61-a52c-4036-b5b6-5c50783db51f@lucifer.local/ [1]
Link: https://lore.kernel.org/all/CAG48ez11zi-1jicHUZtLhyoNPGGVB+ROeAJCUw48bsjk4bbEkA@mail.gmail.com/ [2]
Link: https://lkml.kernel.org/r/20250607220150.2980-1-21cnbao@gmail.com
Signed-off-by: Barry Song <v-songbaohua@oppo.com>
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Acked-by: Qi Zheng <zhengqi.arch@bytedance.com>
Cc: "Liam R. Howlett" <Liam.Howlett@oracle.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Vlastimil Babka <vbabka@suse.cz>
Cc: Jann Horn <jannh@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Lokesh Gidra <lokeshgidra@google.com>
Cc: Tangquan Zheng <zhengtangquan@oppo.com>
Cc: Lance Yang <ioworker0@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
1 parent 5e00e31 commit a6fde7a
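
For illustration only (the sketch below is not part of this commit): this is the userspace pattern the optimization targets, loosely modelled on the micro-benchmark Lance describes. Each thread issues MADV_DONTNEED against a range that lies entirely inside its own single mapping, so the call is eligible for the per-VMA read-lock path instead of mmap_lock. NR_THREADS and CHUNK_SIZE are arbitrary placeholders, and enabling 64KiB mTHP is not shown.

/*
 * Hypothetical userspace sketch: each worker faults in and then drops its
 * own private anonymous mapping.  Because the advised range never crosses
 * a VMA boundary, madvise(MADV_DONTNEED) here can be served under the
 * per-VMA read lock added by this patch.
 */
#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

#define NR_THREADS	8			/* placeholder value */
#define CHUNK_SIZE	(64UL << 20)		/* placeholder: 64 MiB per thread */

static void *worker(void *arg)
{
	char *buf;

	(void)arg;
	buf = mmap(NULL, CHUNK_SIZE, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (buf == MAP_FAILED)
		return NULL;

	memset(buf, 1, CHUNK_SIZE);		/* fault the pages in */

	/* Single-VMA range: may take the per-VMA read lock. */
	if (madvise(buf, CHUNK_SIZE, MADV_DONTNEED))
		perror("madvise");

	munmap(buf, CHUNK_SIZE);
	return NULL;
}

int main(void)
{
	pthread_t threads[NR_THREADS];
	int i;

	for (i = 0; i < NR_THREADS; i++)
		pthread_create(&threads[i], NULL, worker, NULL);
	for (i = 0; i < NR_THREADS; i++)
		pthread_join(threads[i], NULL);
	return 0;
}

A range that spans more than one VMA, a remote mm, or a VMA with userfaultfd armed still falls back to the mmap read lock, as the checks in try_vma_read_lock() below show.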

1 file changed (+150, -48 lines)


mm/madvise.c

Lines changed: 150 additions & 48 deletions
@@ -48,38 +48,19 @@ struct madvise_walk_private {
 	bool pageout;
 };
 
+enum madvise_lock_mode {
+	MADVISE_NO_LOCK,
+	MADVISE_MMAP_READ_LOCK,
+	MADVISE_MMAP_WRITE_LOCK,
+	MADVISE_VMA_READ_LOCK,
+};
+
 struct madvise_behavior {
 	int behavior;
 	struct mmu_gather *tlb;
+	enum madvise_lock_mode lock_mode;
 };
 
-/*
- * Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_lock for writing. Others, which simply traverse vmas, need
- * to only take it for reading.
- */
-static int madvise_need_mmap_write(int behavior)
-{
-	switch (behavior) {
-	case MADV_REMOVE:
-	case MADV_WILLNEED:
-	case MADV_DONTNEED:
-	case MADV_DONTNEED_LOCKED:
-	case MADV_COLD:
-	case MADV_PAGEOUT:
-	case MADV_FREE:
-	case MADV_POPULATE_READ:
-	case MADV_POPULATE_WRITE:
-	case MADV_COLLAPSE:
-	case MADV_GUARD_INSTALL:
-	case MADV_GUARD_REMOVE:
-		return 0;
-	default:
-		/* be safe, default to 1. list exceptions explicitly */
-		return 1;
-	}
-}
-
 #ifdef CONFIG_ANON_VMA_NAME
 struct anon_vma_name *anon_vma_name_alloc(const char *name)
 {
@@ -1339,6 +1320,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
 		return madvise_guard_remove(vma, prev, start, end);
 	}
 
+	/* We cannot provide prev in this lock mode. */
+	VM_WARN_ON_ONCE(arg->lock_mode == MADVISE_VMA_READ_LOCK);
 	anon_name = anon_vma_name(vma);
 	anon_vma_name_get(anon_name);
 	error = madvise_update_vma(vma, prev, start, end, new_flags,
@@ -1488,6 +1471,44 @@ static bool process_madvise_remote_valid(int behavior)
 	}
 }
 
+/*
+ * Try to acquire a VMA read lock if possible.
+ *
+ * We only support this lock over a single VMA, which the input range must
+ * span either partially or fully.
+ *
+ * This function always returns with an appropriate lock held. If a VMA read
+ * lock could be acquired, we return the locked VMA.
+ *
+ * If a VMA read lock could not be acquired, we return NULL and expect caller to
+ * fallback to mmap lock behaviour.
+ */
+static struct vm_area_struct *try_vma_read_lock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior,
+		unsigned long start, unsigned long end)
+{
+	struct vm_area_struct *vma;
+
+	vma = lock_vma_under_rcu(mm, start);
+	if (!vma)
+		goto take_mmap_read_lock;
+	/*
+	 * Must span only a single VMA; uffd and remote processes are
+	 * unsupported.
+	 */
+	if (end > vma->vm_end || current->mm != mm ||
+	    userfaultfd_armed(vma)) {
+		vma_end_read(vma);
+		goto take_mmap_read_lock;
+	}
+	return vma;
+
+take_mmap_read_lock:
+	mmap_read_lock(mm);
+	madv_behavior->lock_mode = MADVISE_MMAP_READ_LOCK;
+	return NULL;
+}
+
 /*
  * Walk the vmas in range [start,end), and call the visit function on each one.
  * The visit function will get start and end parameters that cover the overlap
@@ -1498,7 +1519,8 @@ static bool process_madvise_remote_valid(int behavior)
  */
 static
 int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
-		unsigned long end, void *arg,
+		unsigned long end, struct madvise_behavior *madv_behavior,
+		void *arg,
 		int (*visit)(struct vm_area_struct *vma,
 				struct vm_area_struct **prev, unsigned long start,
 				unsigned long end, void *arg))
@@ -1507,6 +1529,21 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 	struct vm_area_struct *prev;
 	unsigned long tmp;
 	int unmapped_error = 0;
+	int error;
+
+	/*
+	 * If VMA read lock is supported, apply madvise to a single VMA
+	 * tentatively, avoiding walking VMAs.
+	 */
+	if (madv_behavior && madv_behavior->lock_mode == MADVISE_VMA_READ_LOCK) {
+		vma = try_vma_read_lock(mm, madv_behavior, start, end);
+		if (vma) {
+			prev = vma;
+			error = visit(vma, &prev, start, end, arg);
+			vma_end_read(vma);
+			return error;
+		}
+	}
 
 	/*
 	 * If the interval [start,end) covers some unmapped address
@@ -1518,8 +1555,6 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
 	prev = vma;
 
 	for (;;) {
-		int error;
-
 		/* Still start < end. */
 		if (!vma)
 			return -ENOMEM;
@@ -1600,34 +1635,86 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
 	if (end == start)
 		return 0;
 
-	return madvise_walk_vmas(mm, start, end, anon_name,
+	return madvise_walk_vmas(mm, start, end, NULL, anon_name,
 				 madvise_vma_anon_name);
 }
 #endif /* CONFIG_ANON_VMA_NAME */
 
-static int madvise_lock(struct mm_struct *mm, int behavior)
+
+/*
+ * Any behaviour which results in changes to the vma->vm_flags needs to
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
+ * to only take it for reading.
+ */
+static enum madvise_lock_mode get_lock_mode(struct madvise_behavior *madv_behavior)
 {
+	int behavior = madv_behavior->behavior;
+
 	if (is_memory_failure(behavior))
-		return 0;
+		return MADVISE_NO_LOCK;
 
-	if (madvise_need_mmap_write(behavior)) {
+	switch (behavior) {
+	case MADV_REMOVE:
+	case MADV_WILLNEED:
+	case MADV_COLD:
+	case MADV_PAGEOUT:
+	case MADV_FREE:
+	case MADV_POPULATE_READ:
+	case MADV_POPULATE_WRITE:
+	case MADV_COLLAPSE:
+	case MADV_GUARD_INSTALL:
+	case MADV_GUARD_REMOVE:
+		return MADVISE_MMAP_READ_LOCK;
+	case MADV_DONTNEED:
+	case MADV_DONTNEED_LOCKED:
+		return MADVISE_VMA_READ_LOCK;
+	default:
+		return MADVISE_MMAP_WRITE_LOCK;
+	}
+}
+
+static int madvise_lock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior)
+{
+	enum madvise_lock_mode lock_mode = get_lock_mode(madv_behavior);
+
+	switch (lock_mode) {
+	case MADVISE_NO_LOCK:
+		break;
+	case MADVISE_MMAP_WRITE_LOCK:
 		if (mmap_write_lock_killable(mm))
 			return -EINTR;
-	} else {
+		break;
+	case MADVISE_MMAP_READ_LOCK:
 		mmap_read_lock(mm);
+		break;
+	case MADVISE_VMA_READ_LOCK:
+		/* We will acquire the lock per-VMA in madvise_walk_vmas(). */
+		break;
 	}
+
+	madv_behavior->lock_mode = lock_mode;
 	return 0;
 }
 
-static void madvise_unlock(struct mm_struct *mm, int behavior)
+static void madvise_unlock(struct mm_struct *mm,
+		struct madvise_behavior *madv_behavior)
 {
-	if (is_memory_failure(behavior))
+	switch (madv_behavior->lock_mode) {
+	case MADVISE_NO_LOCK:
 		return;
-
-	if (madvise_need_mmap_write(behavior))
+	case MADVISE_MMAP_WRITE_LOCK:
 		mmap_write_unlock(mm);
-	else
+		break;
+	case MADVISE_MMAP_READ_LOCK:
 		mmap_read_unlock(mm);
+		break;
+	case MADVISE_VMA_READ_LOCK:
+		/* We will drop the lock per-VMA in madvise_walk_vmas(). */
+		break;
+	}
+
+	madv_behavior->lock_mode = MADVISE_NO_LOCK;
 }
 
 static bool madvise_batch_tlb_flush(int behavior)
@@ -1712,6 +1799,21 @@ static bool is_madvise_populate(int behavior)
 	}
 }
 
+/*
+ * untagged_addr_remote() assumes mmap_lock is already held. On
+ * architectures like x86 and RISC-V, tagging is tricky because each
+ * mm may have a different tagging mask. However, we might only hold
+ * the per-VMA lock (currently only local processes are supported),
+ * so untagged_addr is used to avoid the mmap_lock assertion for
+ * local processes.
+ */
+static inline unsigned long get_untagged_addr(struct mm_struct *mm,
+		unsigned long start)
+{
+	return current->mm == mm ? untagged_addr(start) :
+			untagged_addr_remote(mm, start);
+}
+
 static int madvise_do_behavior(struct mm_struct *mm,
 		unsigned long start, size_t len_in,
 		struct madvise_behavior *madv_behavior)
@@ -1723,15 +1825,15 @@ static int madvise_do_behavior(struct mm_struct *mm,
 
 	if (is_memory_failure(behavior))
 		return madvise_inject_error(behavior, start, start + len_in);
-	start = untagged_addr_remote(mm, start);
+	start = get_untagged_addr(mm, start);
 	end = start + PAGE_ALIGN(len_in);
 
 	blk_start_plug(&plug);
 	if (is_madvise_populate(behavior))
 		error = madvise_populate(mm, start, end, behavior);
 	else
 		error = madvise_walk_vmas(mm, start, end, madv_behavior,
-				madvise_vma_behavior);
+				madv_behavior, madvise_vma_behavior);
 	blk_finish_plug(&plug);
 	return error;
 }
@@ -1819,13 +1921,13 @@ int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int beh
 
 	if (madvise_should_skip(start, len_in, behavior, &error))
 		return error;
-	error = madvise_lock(mm, behavior);
+	error = madvise_lock(mm, &madv_behavior);
 	if (error)
 		return error;
 	madvise_init_tlb(&madv_behavior, mm);
 	error = madvise_do_behavior(mm, start, len_in, &madv_behavior);
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, behavior);
+	madvise_unlock(mm, &madv_behavior);
 
 	return error;
 }
@@ -1849,7 +1951,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 
 	total_len = iov_iter_count(iter);
 
-	ret = madvise_lock(mm, behavior);
+	ret = madvise_lock(mm, &madv_behavior);
 	if (ret)
 		return ret;
 	madvise_init_tlb(&madv_behavior, mm);
@@ -1882,8 +1984,8 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 
 			/* Drop and reacquire lock to unwind race. */
 			madvise_finish_tlb(&madv_behavior);
-			madvise_unlock(mm, behavior);
-			ret = madvise_lock(mm, behavior);
+			madvise_unlock(mm, &madv_behavior);
+			ret = madvise_lock(mm, &madv_behavior);
 			if (ret)
 				goto out;
 			madvise_init_tlb(&madv_behavior, mm);
@@ -1894,7 +1996,7 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter,
 		iov_iter_advance(iter, iter_iov_len(iter));
 	}
 	madvise_finish_tlb(&madv_behavior);
-	madvise_unlock(mm, behavior);
+	madvise_unlock(mm, &madv_behavior);
 
 out:
 	ret = (total_len - iov_iter_count(iter)) ? : ret;
