From f8f55e9ec73f0a07e55fd91ce82fdca0796ad66a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 24 Sep 2024 19:59:11 +0100 Subject: [PATCH 001/215] selftests/mm: add pkey_sighandler_xx, hugetlb_dio to .gitignore Commit 6998a73efbb8 ("selftests/mm: Add new testcases for pkeys") and commit 3a103b5315b7 ("selftest: mm: Test if hugepage does not get leaked during __bio_release_pages()") generate test binaries hugetlb_dio, pkey_sighandler_tests_32 and pkey_sighandler_tests_64 but did not add these to .gitignore. Correct this. Link: https://lkml.kernel.org/r/20240924185911.117937-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Donet Tom Cc: Keith Lucas Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index da030b43e43be..689bbd5202964 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -51,3 +51,6 @@ hugetlb_madv_vs_map mseal_test seal_elf droppable +hugetlb_dio +pkey_sighandler_tests_32 +pkey_sighandler_tests_64 From d2d243df445a88c26e91eac02b041213c7a32e9e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sun, 22 Sep 2024 12:32:13 +0800 Subject: [PATCH 002/215] mm: shmem: fix khugepaged activation policy for shmem Shmem has a separate interface (different from anonymous pages) to control huge page allocation, that means shmem THP can be enabled while anonymous THP is disabled. However, in this case, khugepaged will not start to collapse shmem THP, which is unreasonable. To fix this issue, we should call start_stop_khugepaged() to activate or deactivate the khugepaged thread when setting shmem mTHP interfaces. Moreover, add a new helper shmem_hpage_pmd_enabled() to help to check whether shmem THP is enabled, which will determine if khugepaged should be activated. 
Link: https://lkml.kernel.org/r/9b9c6cbc4499bf44c6455367fd9e0f6036525680.1726978977.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reported-by: Ryan Roberts Reviewed-by: Ryan Roberts Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 6 ++++++ mm/khugepaged.c | 6 +++++- mm/shmem.c | 29 +++++++++++++++++++++++++++-- 3 files changed, 38 insertions(+), 3 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 515a9a6a3c6f8..ee6635052383e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -114,6 +114,7 @@ int shmem_unuse(unsigned int type); unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force); +bool shmem_hpage_pmd_enabled(void); #else static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, @@ -121,6 +122,11 @@ static inline unsigned long shmem_allowable_huge_orders(struct inode *inode, { return 0; } + +static inline bool shmem_hpage_pmd_enabled(void) +{ + return false; +} #endif #ifdef CONFIG_SHMEM diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b538c3d48386a..0bd80e134010f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -416,9 +416,11 @@ static inline int hpage_collapse_test_exit_or_disable(struct mm_struct *mm) static bool hugepage_pmd_enabled(void) { /* - * We cover both the anon and the file-backed case here; file-backed + * We cover the anon, shmem and the file-backed case here; file-backed * hugepages, when configured in, are determined by the global control. * Anon pmd-sized hugepages are determined by the pmd-size control. + * Shmem pmd-sized hugepages are also determined by its pmd-size control, + * except when the global shmem_huge is set to SHMEM_HUGE_DENY. */ if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && hugepage_global_enabled()) @@ -430,6 +432,8 @@ static bool hugepage_pmd_enabled(void) if (test_bit(PMD_ORDER, &huge_anon_orders_inherit) && hugepage_global_enabled()) return true; + if (IS_ENABLED(CONFIG_SHMEM) && shmem_hpage_pmd_enabled()) + return true; return false; } diff --git a/mm/shmem.c b/mm/shmem.c index e87f5d6799a7b..6ad50ba60d8ef 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1655,6 +1655,23 @@ static gfp_t limit_gfp_mask(gfp_t huge_gfp, gfp_t limit_gfp) } #ifdef CONFIG_TRANSPARENT_HUGEPAGE +bool shmem_hpage_pmd_enabled(void) +{ + if (shmem_huge == SHMEM_HUGE_DENY) + return false; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_always)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_madvise)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_within_size)) + return true; + if (test_bit(HPAGE_PMD_ORDER, &huge_shmem_orders_inherit) && + shmem_huge != SHMEM_HUGE_NEVER) + return true; + + return false; +} + unsigned long shmem_allowable_huge_orders(struct inode *inode, struct vm_area_struct *vma, pgoff_t index, loff_t write_end, bool shmem_huge_force) @@ -5024,7 +5041,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { char tmp[16]; - int huge; + int huge, err; if (count + 1 > sizeof(tmp)) return -EINVAL; @@ -5048,7 +5065,9 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) SHMEM_SB(shm_mnt->mnt_sb)->huge = shmem_huge; - return count; + + err = start_stop_khugepaged(); + return err ? 
err : count; } struct kobj_attribute shmem_enabled_attr = __ATTR_RW(shmem_enabled); @@ -5125,6 +5144,12 @@ static ssize_t thpsize_shmem_enabled_store(struct kobject *kobj, ret = -EINVAL; } + if (ret > 0) { + int err = start_stop_khugepaged(); + + if (err) + ret = err; + } return ret; } From ba7196e566516f798635e26e976ae44f708d9d54 Mon Sep 17 00:00:00 2001 From: Leo Stone Date: Sun, 22 Sep 2024 19:25:18 -0700 Subject: [PATCH 003/215] mm/damon: fix sparse warning for zero initializer sparse warns about zero initializing an array with {0,}, change it to the equivalent {0}. Fixes the sparse warning: mm/damon/tests/vaddr-kunit.h:69:47: warning: missing braces around initializer Link: https://lkml.kernel.org/r/xriwklcwjpwcz7eiavo6f7envdar4jychhsk6sfkj5klaznb6b@j6vrvr2sxjht Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: Leo Stone Reviewed-by: SeongJae Park Cc: Jinjie Ruan Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/tests/vaddr-kunit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index a339d117150fb..3dad8dfd9005f 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -66,7 +66,7 @@ static int __link_vmas(struct maple_tree *mt, struct vm_area_struct *vmas, static void damon_test_three_regions_in_vmas(struct kunit *test) { static struct mm_struct mm; - struct damon_addr_range regions[3] = {0,}; + struct damon_addr_range regions[3] = {0}; /* 10-20-25, 200-210-220, 300-305, 307-330 */ struct vm_area_struct vmas[] = { (struct vm_area_struct) {.vm_start = 10, .vm_end = 20}, From 15ff4d409e1a6f939d94d2005ae275c26b2b0d9d Mon Sep 17 00:00:00 2001 From: Jingxiang Zeng Date: Fri, 30 Aug 2024 16:22:44 +0800 Subject: [PATCH 004/215] mm/memcontrol: add per-memcg pgpgin/pswpin counter In proactive memory reclamation scenarios, it is necessary to estimate the pswpin and pswpout metrics of the cgroup to determine whether to continue reclaiming anonymous pages in the current batch. This patch will collect these metrics and expose them. 
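A minimal userspace sketch of the stated use case (an illustration, not part of this patch): sample the new counter around one proactive reclaim batch and back off once swap-ins start to grow. It assumes the counters show up in the cgroup's memory.stat under the names pswpin/pswpout and that /sys/fs/cgroup/workload is the cgroup being reclaimed; both the path and the field name are assumptions made for the sketch.

/*
 * Hypothetical sketch: read the per-memcg pswpin counter before and
 * after a proactive reclaim batch; a growing delta means the cgroup is
 * swapping back in what was just reclaimed, so stop the current batch.
 * The cgroup path and counter name are assumptions, not guaranteed
 * interfaces.
 */
#include <stdio.h>
#include <string.h>

static long read_memcg_stat(const char *cgroup_dir, const char *key)
{
	char path[256], name[64];
	long val = -1, v;
	FILE *f;

	snprintf(path, sizeof(path), "%s/memory.stat", cgroup_dir);
	f = fopen(path, "r");
	if (!f)
		return -1;
	/* memory.stat is a list of "name value" lines */
	while (fscanf(f, "%63s %ld", name, &v) == 2)
		if (strcmp(name, key) == 0)
			val = v;
	fclose(f);
	return val;
}

int main(void)
{
	/* hypothetical cgroup path; adjust for the workload under test */
	const char *cg = "/sys/fs/cgroup/workload";
	long before, after;

	before = read_memcg_stat(cg, "pswpin");
	/* ... trigger one proactive reclaim batch here ... */
	after = read_memcg_stat(cg, "pswpin");

	if (before >= 0 && after > before)
		printf("pswpin grew by %ld, stop this reclaim round\n",
		       after - before);
	return 0;
}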
[linuszeng@tencent.com: v2] Link: https://lkml.kernel.org/r/20240913084453.3605621-1-jingxiangzeng.cas@gmail.com Link: https://lkml.kernel.org/r/20240830082244.156923-1-jingxiangzeng.cas@gmail.com Signed-off-by: Jingxiang Zeng Acked-by: Nhat Pham Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 ++ mm/page_io.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 06df2af974159..d6159266185f5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -418,6 +418,8 @@ static const unsigned int memcg_vm_event_stat[] = { PGPGIN, PGPGOUT, #endif + PSWPIN, + PSWPOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSCAN_KHUGEPAGED, diff --git a/mm/page_io.c b/mm/page_io.c index 69536a2b3c138..40392782cdcb9 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -290,6 +290,7 @@ static inline void count_swpout_vm_event(struct folio *folio) } count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); #endif + count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio)); count_vm_events(PSWPOUT, folio_nr_pages(folio)); } @@ -485,6 +486,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); + count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); folio_mark_uptodate(folio); folio_unlock(folio); } @@ -570,6 +572,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio, * attempt to access it in the page fault retry time check. */ get_task_struct(current); + count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio_wait(&bio); __end_swap_bio_read(&bio); @@ -585,6 +588,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); + count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio(bio); } From 9e9e085effe9b7e342138fde3cf8577d22509932 Mon Sep 17 00:00:00 2001 From: Adrian Huang Date: Sat, 27 Jul 2024 00:52:46 +0800 Subject: [PATCH 005/215] mm/vmalloc: combine all TLB flush operations of KASAN shadow virtual address into one operation When compiling kernel source 'make -j $(nproc)' with the up-and-running KASAN-enabled kernel on a 256-core machine, the following soft lockup is shown: watchdog: BUG: soft lockup - CPU#28 stuck for 22s!
[kworker/28:1:1760] CPU: 28 PID: 1760 Comm: kworker/28:1 Kdump: loaded Not tainted 6.10.0-rc5 #95 Workqueue: events drain_vmap_area_work RIP: 0010:smp_call_function_many_cond+0x1d8/0xbb0 Code: 38 c8 7c 08 84 c9 0f 85 49 08 00 00 8b 45 08 a8 01 74 2e 48 89 f1 49 89 f7 48 c1 e9 03 41 83 e7 07 4c 01 e9 41 83 c7 03 f3 90 <0f> b6 01 41 38 c7 7c 08 84 c0 0f 85 d4 06 00 00 8b 45 08 a8 01 75 RSP: 0018:ffffc9000cb3fb60 EFLAGS: 00000202 RAX: 0000000000000011 RBX: ffff8883bc4469c0 RCX: ffffed10776e9949 RDX: 0000000000000002 RSI: ffff8883bb74ca48 RDI: ffffffff8434dc50 RBP: ffff8883bb74ca40 R08: ffff888103585dc0 R09: ffff8884533a1800 R10: 0000000000000004 R11: ffffffffffffffff R12: ffffed1077888d39 R13: dffffc0000000000 R14: ffffed1077888d38 R15: 0000000000000003 FS: 0000000000000000(0000) GS:ffff8883bc400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00005577b5c8d158 CR3: 0000000004850000 CR4: 0000000000350ef0 Call Trace: ? watchdog_timer_fn+0x2cd/0x390 ? __pfx_watchdog_timer_fn+0x10/0x10 ? __hrtimer_run_queues+0x300/0x6d0 ? sched_clock_cpu+0x69/0x4e0 ? __pfx___hrtimer_run_queues+0x10/0x10 ? srso_return_thunk+0x5/0x5f ? ktime_get_update_offsets_now+0x7f/0x2a0 ? srso_return_thunk+0x5/0x5f ? srso_return_thunk+0x5/0x5f ? hrtimer_interrupt+0x2ca/0x760 ? __sysvec_apic_timer_interrupt+0x8c/0x2b0 ? sysvec_apic_timer_interrupt+0x6a/0x90 ? asm_sysvec_apic_timer_interrupt+0x16/0x20 ? smp_call_function_many_cond+0x1d8/0xbb0 ? __pfx_do_kernel_range_flush+0x10/0x10 on_each_cpu_cond_mask+0x20/0x40 flush_tlb_kernel_range+0x19b/0x250 ? srso_return_thunk+0x5/0x5f ? kasan_release_vmalloc+0xa7/0xc0 purge_vmap_node+0x357/0x820 ? __pfx_purge_vmap_node+0x10/0x10 __purge_vmap_area_lazy+0x5b8/0xa10 drain_vmap_area_work+0x21/0x30 process_one_work+0x661/0x10b0 worker_thread+0x844/0x10e0 ? srso_return_thunk+0x5/0x5f ? __kthread_parkme+0x82/0x140 ? __pfx_worker_thread+0x10/0x10 kthread+0x2a5/0x370 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x30/0x70 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Debugging Analysis: 1. The following ftrace log shows that the lockup CPU spends too much time iterating vmap_nodes and flushing TLB when purging vm_area structures. (Some info is trimmed). kworker: funcgraph_entry: | drain_vmap_area_work() { kworker: funcgraph_entry: | mutex_lock() { kworker: funcgraph_entry: 1.092 us | __cond_resched(); kworker: funcgraph_exit: 3.306 us | } ... ... kworker: funcgraph_entry: | flush_tlb_kernel_range() { ... ... kworker: funcgraph_exit: # 7533.649 us | } ... ... kworker: funcgraph_entry: 2.344 us | mutex_unlock(); kworker: funcgraph_exit: $ 23871554 us | } The drain_vmap_area_work() spends over 23 seconds. There are 2805 flush_tlb_kernel_range() calls in the ftrace log. * One is called in __purge_vmap_area_lazy(). * Others are called by purge_vmap_node->kasan_release_vmalloc. purge_vmap_node() iteratively releases kasan vmalloc allocations and flushes TLB for each vmap_area. - [Rough calculation] Each flush_tlb_kernel_range() runs about 7.5ms. -- 2804 * 7.5ms = 21.03 seconds. -- That's why a soft lock is triggered. 2. Extending the soft lockup time can work around the issue (For example, # echo 60 > /proc/sys/kernel/watchdog_thresh). This confirms the above-mentioned speculation: drain_vmap_area_work() spends too much time. If we combine all TLB flush operations of the KASAN shadow virtual address into one operation in the call path 'purge_vmap_node()->kasan_release_vmalloc()', the running time of drain_vmap_area_work() can be saved greatly. 
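The batching idea can be sketched outside the kernel as "collect the union of the freed ranges, then flush once". The sketch below is illustrative only; flush_range() is a stand-in for flush_tlb_kernel_range(), not kernel API usage, and the trade-off it shows is some over-flushing of the gaps between ranges in exchange for one expensive flush instead of thousands.

#include <stdio.h>

struct vrange {
	unsigned long start;
	unsigned long end;
};

/* stand-in for flush_tlb_kernel_range(); each call models one
 * expensive cross-CPU flush */
static void flush_range(unsigned long start, unsigned long end)
{
	printf("flush [%#lx, %#lx)\n", start, end);
}

/* naive variant: one flush per freed region (the behaviour that piled
 * up into the soft lockup above) */
static void release_one_by_one(const struct vrange *r, int n)
{
	for (int i = 0; i < n; i++)
		flush_range(r[i].start, r[i].end);
}

/* batched variant: compute the covering range and flush it once, the
 * way the patch handles the KASAN shadow of all areas purged in one
 * pass */
static void release_batched(const struct vrange *r, int n)
{
	unsigned long lo = r[0].start, hi = r[0].end;

	for (int i = 1; i < n; i++) {
		if (r[i].start < lo)
			lo = r[i].start;
		if (r[i].end > hi)
			hi = r[i].end;
	}
	flush_range(lo, hi);
}

int main(void)
{
	struct vrange purged[] = {
		{ 0x100000, 0x104000 },
		{ 0x110000, 0x114000 },
		{ 0x120000, 0x124000 },
	};
	int n = sizeof(purged) / sizeof(purged[0]);

	release_one_by_one(purged, n);	/* three flushes */
	release_batched(purged, n);	/* one flush covering the union */
	return 0;
}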
The idea is from the flush_tlb_kernel_range() call in __purge_vmap_area_lazy(). And, the soft lockup won't be triggered. Here is the test result based on 6.10: [6.10 wo/ the patch] 1. ftrace latency profiling (record a trace if the latency > 20s). echo 20000000 > /sys/kernel/debug/tracing/tracing_thresh echo drain_vmap_area_work > /sys/kernel/debug/tracing/set_graph_function echo function_graph > /sys/kernel/debug/tracing/current_tracer echo 1 > /sys/kernel/debug/tracing/tracing_on 2. Run `make -j $(nproc)` to compile the kernel source 3. Once the soft lockup is reproduced, check the ftrace log: cat /sys/kernel/debug/tracing/trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 76) $ 50412985 us | } /* __purge_vmap_area_lazy */ 76) $ 50412997 us | } /* drain_vmap_area_work */ 76) $ 29165911 us | } /* __purge_vmap_area_lazy */ 76) $ 29165926 us | } /* drain_vmap_area_work */ 91) $ 53629423 us | } /* __purge_vmap_area_lazy */ 91) $ 53629434 us | } /* drain_vmap_area_work */ 91) $ 28121014 us | } /* __purge_vmap_area_lazy */ 91) $ 28121026 us | } /* drain_vmap_area_work */ [6.10 w/ the patch] 1. Repeat step 1-2 in "[6.10 wo/ the patch]" 2. The soft lockup is not triggered and ftrace log is empty. cat /sys/kernel/debug/tracing/trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 3. Setting 'tracing_thresh' to 10/5 seconds does not get any ftrace log. 4. Setting 'tracing_thresh' to 1 second gets ftrace log. cat /sys/kernel/debug/tracing/trace # tracer: function_graph # # CPU DURATION FUNCTION CALLS # | | | | | | | 23) $ 1074942 us | } /* __purge_vmap_area_lazy */ 23) $ 1074950 us | } /* drain_vmap_area_work */ The worst execution time of drain_vmap_area_work() is about 1 second. Link: https://lore.kernel.org/lkml/ZqFlawuVnOMY2k3E@pc638.lan/ Link: https://lkml.kernel.org/r/20240726165246.31326-1-ahuang12@lenovo.com Fixes: 282631cb2447 ("mm: vmalloc: remove global purge_vmap_area_root rb-tree") Signed-off-by: Adrian Huang Co-developed-by: Uladzislau Rezki (Sony) Signed-off-by: Uladzislau Rezki (Sony) Tested-by: Jiwei Sun Reviewed-by: Baoquan He Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Christoph Hellwig Cc: Dmitry Vyukov Cc: Vincenzo Frascino Cc: Signed-off-by: Andrew Morton --- include/linux/kasan.h | 12 +++++++++--- mm/kasan/shadow.c | 14 ++++++++++---- mm/vmalloc.c | 34 ++++++++++++++++++++++++++-------- 3 files changed, 45 insertions(+), 15 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 00a3bf7c0d8f0..6bbfc8aa42e8f 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -29,6 +29,9 @@ typedef unsigned int __bitwise kasan_vmalloc_flags_t; #define KASAN_VMALLOC_VM_ALLOC ((__force kasan_vmalloc_flags_t)0x02u) #define KASAN_VMALLOC_PROT_NORMAL ((__force kasan_vmalloc_flags_t)0x04u) +#define KASAN_VMALLOC_PAGE_RANGE 0x1 /* Apply exsiting page range */ +#define KASAN_VMALLOC_TLB_FLUSH 0x2 /* TLB flush */ + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #include @@ -564,7 +567,8 @@ void kasan_populate_early_vm_area_shadow(void *start, unsigned long size); int kasan_populate_vmalloc(unsigned long addr, unsigned long size); void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end); + unsigned long free_region_end, + unsigned long flags); #else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ @@ -579,7 +583,8 @@ static inline int kasan_populate_vmalloc(unsigned long start, static inline 
void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end) { } + unsigned long free_region_end, + unsigned long flags) { } #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ @@ -614,7 +619,8 @@ static inline int kasan_populate_vmalloc(unsigned long start, static inline void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end) { } + unsigned long free_region_end, + unsigned long flags) { } static inline void *kasan_unpoison_vmalloc(const void *start, unsigned long size, diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c index d6210ca48ddab..88d1c9dcb5072 100644 --- a/mm/kasan/shadow.c +++ b/mm/kasan/shadow.c @@ -489,7 +489,8 @@ static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr, */ void kasan_release_vmalloc(unsigned long start, unsigned long end, unsigned long free_region_start, - unsigned long free_region_end) + unsigned long free_region_end, + unsigned long flags) { void *shadow_start, *shadow_end; unsigned long region_start, region_end; @@ -522,12 +523,17 @@ void kasan_release_vmalloc(unsigned long start, unsigned long end, __memset(shadow_start, KASAN_SHADOW_INIT, shadow_end - shadow_start); return; } - apply_to_existing_page_range(&init_mm, + + + if (flags & KASAN_VMALLOC_PAGE_RANGE) + apply_to_existing_page_range(&init_mm, (unsigned long)shadow_start, size, kasan_depopulate_vmalloc_pte, NULL); - flush_tlb_kernel_range((unsigned long)shadow_start, - (unsigned long)shadow_end); + + if (flags & KASAN_VMALLOC_TLB_FLUSH) + flush_tlb_kernel_range((unsigned long)shadow_start, + (unsigned long)shadow_end); } } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 634162271c004..5480b77f4167d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2182,6 +2182,25 @@ decay_va_pool_node(struct vmap_node *vn, bool full_decay) reclaim_list_global(&decay_list); } +static void +kasan_release_vmalloc_node(struct vmap_node *vn) +{ + struct vmap_area *va; + unsigned long start, end; + + start = list_first_entry(&vn->purge_list, struct vmap_area, list)->va_start; + end = list_last_entry(&vn->purge_list, struct vmap_area, list)->va_end; + + list_for_each_entry(va, &vn->purge_list, list) { + if (is_vmalloc_or_module_addr((void *) va->va_start)) + kasan_release_vmalloc(va->va_start, va->va_end, + va->va_start, va->va_end, + KASAN_VMALLOC_PAGE_RANGE); + } + + kasan_release_vmalloc(start, end, start, end, KASAN_VMALLOC_TLB_FLUSH); +} + static void purge_vmap_node(struct work_struct *work) { struct vmap_node *vn = container_of(work, @@ -2190,20 +2209,17 @@ static void purge_vmap_node(struct work_struct *work) struct vmap_area *va, *n_va; LIST_HEAD(local_list); + if (IS_ENABLED(CONFIG_KASAN_VMALLOC)) + kasan_release_vmalloc_node(vn); + vn->nr_purged = 0; list_for_each_entry_safe(va, n_va, &vn->purge_list, list) { unsigned long nr = va_size(va) >> PAGE_SHIFT; - unsigned long orig_start = va->va_start; - unsigned long orig_end = va->va_end; unsigned int vn_id = decode_vn_id(va->flags); list_del_init(&va->list); - if (is_vmalloc_or_module_addr((void *)orig_start)) - kasan_release_vmalloc(orig_start, orig_end, - va->va_start, va->va_end); - nr_purged_pages += nr; vn->nr_purged++; @@ -4784,7 +4800,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, - va->va_start, va->va_end); + va->va_start, va->va_end, + KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH); 
vas[area] = NULL; } @@ -4834,7 +4851,8 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, &free_vmap_area_list); if (va) kasan_release_vmalloc(orig_start, orig_end, - va->va_start, va->va_end); + va->va_start, va->va_end, + KASAN_VMALLOC_PAGE_RANGE | KASAN_VMALLOC_TLB_FLUSH); vas[area] = NULL; kfree(vms[area]); } From bf779fb9afb5c5cc3c45d19a7a1ea7cd77c742f0 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 17 Sep 2024 11:09:06 +0900 Subject: [PATCH 006/215] zram: introduce ZRAM_PP_SLOT flag Patch series "zram: optimal post-processing target selection", v5. Problem: -------- Both recompression and writeback perform a very simple linear scan of all zram slots in search for post-processing (writeback or recompress) candidate slots. This often means that we pick the worst candidate for pp (post-processing), e.g. a 48 bytes object for writeback, which is nearly useless, because it only releases 48 bytes from zsmalloc pool, but consumes an entire 4K slot in the backing device. Similarly, recompression of a 48 bytes object is unlikely to save more memory than recompression of a 3000 bytes object. Both recompression and writeback consume constrained resources (CPU time, battery, backing device storage space) and quite often have a (daily) limit on the number of items they post-process, so we should utilize those constrained resources in the most optimal way. Solution: --------- This patch reworks the way we select pp targets. We, quite clearly, want to sort all the candidates and always pick the largest, be it recompression or writeback. Especially for writeback, because the larger object we writeback the more memory we release. This series introduces the concept of pp buckets and pp scan/selection. The scan step is a simple iteration over all zram->table entries, just like what we currently do, but we don't post-process a candidate slot immediately. Instead we assign it to a PP (post-processing) bucket. PP bucket is, basically, a list which holds pp candidate slots that belong to the same size class. PP buckets are 64 bytes apart; slots are not strictly sorted within a bucket, there is a 64 bytes variance. The select step simply iterates over pp buckets from highest to lowest and picks all candidate slots a particular bucket contains. So this gives us sorted candidates (in linear time) and allows us to select most optimal (largest) candidates for post-processing first. This patch (of 7): This flag indicates that the slot was selected as a candidate slot for post-processing (pp) and was assigned to a pp bucket. It does not necessarily mean that the slot is currently under post-processing, but may mean so. The slot can lose its PP_SLOT flag, while still being in the pp-bucket, if it's accessed or slot_free-ed.
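A standalone sketch of the pp-bucket idea described above (not the zram code itself; the 4K page size and 64-byte bucket width mirror the series, everything else is assumed for illustration): one linear pass bins candidate sizes into buckets, and selection then walks the buckets from the largest down.

#include <stdio.h>

#define PAGE_SIZE	4096
#define BUCKET_RANGE	64
#define NUM_BUCKETS	(PAGE_SIZE / BUCKET_RANGE + 1)
#define MAX_PER_BUCKET	16

static int buckets[NUM_BUCKETS][MAX_PER_BUCKET];
static int bucket_len[NUM_BUCKETS];

/* scan step: bin a candidate by size, one bucket per 64-byte range */
static void place(int size)
{
	int idx = size / BUCKET_RANGE;

	if (bucket_len[idx] < MAX_PER_BUCKET)
		buckets[idx][bucket_len[idx]++] = size;
}

/* select step: take candidates from the highest non-empty bucket */
static int select_largest(void)
{
	for (int idx = NUM_BUCKETS - 1; idx >= 0; idx--)
		if (bucket_len[idx])
			return buckets[idx][--bucket_len[idx]];
	return -1;	/* no candidates left */
}

int main(void)
{
	int sizes[] = { 48, 2800, 154, 3694, 424, 93 };

	for (unsigned int i = 0; i < sizeof(sizes) / sizeof(sizes[0]); i++)
		place(sizes[i]);	/* one linear scan, as in the series */

	for (int s; (s = select_largest()) >= 0; )
		printf("post-process slot of %d bytes\n", s);
	return 0;
}

With the sample sizes above it prints 3694 first, then 2800, and the 48 bytes object last, which is exactly the ordering the series aims for.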
Link: https://lkml.kernel.org/r/20240917021020.883356-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20240917021020.883356-2-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 ++ drivers/block/zram/zram_drv.h | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index ad9c9bc3ccfc5..d61750c1c5b5c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -178,6 +178,7 @@ static inline u32 zram_get_priority(struct zram *zram, u32 index) static void zram_accessed(struct zram *zram, u32 index) { zram_clear_flag(zram, index, ZRAM_IDLE); + zram_clear_flag(zram, index, ZRAM_PP_SLOT); #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME zram->table[index].ac_time = ktime_get_boottime(); #endif @@ -1354,6 +1355,7 @@ static void zram_free_page(struct zram *zram, size_t index) zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE); zram_set_priority(zram, index, 0); + zram_clear_flag(zram, index, ZRAM_PP_SLOT); if (zram_test_flag(zram, index, ZRAM_WB)) { zram_clear_flag(zram, index, ZRAM_WB); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index cfc8c059db636..914cb66299694 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -48,6 +48,7 @@ enum zram_pageflags { ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ ZRAM_UNDER_WB, /* page is under writeback */ + ZRAM_PP_SLOT, /* Selected for post-processing */ ZRAM_HUGE, /* Incompressible page */ ZRAM_IDLE, /* not accessed page since last idle marking */ ZRAM_INCOMPRESSIBLE, /* none of the algorithms could compress it */ From 58652f2b6d21f2874c9f060165ec7e03e8b1fc71 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 17 Sep 2024 11:09:07 +0900 Subject: [PATCH 007/215] zram: permit only one post-processing operation at a time Both recompress and writeback soon will unlock slots during processing, which makes things too complex wrt possible race conditions. We still want to clear PP_SLOT in slot_free, because this is how we figure out that a slot that was selected for post-processing has been released under us, and when we start post-processing we check if the slot still has PP_SLOT set. At the same time, theoretically, we can have something like this: CPU0 CPU1 recompress scan slots set PP_SLOT unlock slot slot_free clear PP_SLOT allocate PP_SLOT writeback scan slots set PP_SLOT unlock slot select PP-slot test PP_SLOT So recompress will not detect that slot has been re-used and re-selected for concurrent writeback post-processing. Make sure that we only permit one post-processing operation at a time. So now recompress and writeback post-processing don't race against each other, we only need to handle slot re-use (slot_free and write), which is handled individually by each pp operation. Having recompress and writeback competing for the same slots is not exactly good anyway (can't imagine anyone doing that).
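The gating pattern used here (an atomic exchange on a pp_in_progress flag) can be sketched in userspace with C11 atomics; this is an illustration of the pattern, not the driver code. The first caller that flips the flag from 0 to 1 owns post-processing, while a concurrent attempt sees the old non-zero value and backs off with -EAGAIN, mirroring what the sysfs handlers do.

#include <errno.h>
#include <stdatomic.h>
#include <stdio.h>

static atomic_int pp_in_progress;

static int pp_begin(void)
{
	/* atomic exchange returns the previous value; non-zero means
	 * another post-processing operation already owns the flag */
	if (atomic_exchange(&pp_in_progress, 1))
		return -EAGAIN;
	return 0;
}

static void pp_end(void)
{
	atomic_store(&pp_in_progress, 0);
}

int main(void)
{
	if (pp_begin() == 0) {
		printf("writeback runs\n");
		/* a concurrent recompress attempt now fails with -EAGAIN */
		printf("recompress attempt -> %d\n", pp_begin());
		pp_end();
	}
	return 0;
}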
Link: https://lkml.kernel.org/r/20240917021020.883356-3-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- Documentation/admin-guide/blockdev/zram.rst | 2 ++ drivers/block/zram/zram_drv.c | 16 ++++++++++++++++ drivers/block/zram/zram_drv.h | 1 + 3 files changed, 19 insertions(+) diff --git a/Documentation/admin-guide/blockdev/zram.rst b/Documentation/admin-guide/blockdev/zram.rst index 678d70d6e1c3a..714a5171bfc0b 100644 --- a/Documentation/admin-guide/blockdev/zram.rst +++ b/Documentation/admin-guide/blockdev/zram.rst @@ -47,6 +47,8 @@ The list of possible return codes: -ENOMEM zram was not able to allocate enough memory to fulfil your needs. -EINVAL invalid input has been provided. +-EAGAIN re-try operation later (e.g. when attempting to run recompress + and writeback simultaneously). ======== ============================================================= If you use 'echo', the returned value is set by the 'echo' utility, diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index d61750c1c5b5c..37a284f709ba9 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -627,6 +627,12 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } + /* Do not permit concurrent post-processing actions. */ + if (atomic_xchg(&zram->pp_in_progress, 1)) { + up_read(&zram->init_lock); + return -EAGAIN; + } + if (!zram->backing_dev) { ret = -ENODEV; goto release_init_lock; @@ -753,6 +759,7 @@ static ssize_t writeback_store(struct device *dev, free_block_bdev(zram, blk_idx); __free_page(page); release_init_lock: + atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); return ret; @@ -1883,6 +1890,12 @@ static ssize_t recompress_store(struct device *dev, goto release_init_lock; } + /* Do not permit concurrent post-processing actions. */ + if (atomic_xchg(&zram->pp_in_progress, 1)) { + up_read(&zram->init_lock); + return -EAGAIN; + } + if (algo) { bool found = false; @@ -1950,6 +1963,7 @@ static ssize_t recompress_store(struct device *dev, __free_page(page); release_init_lock: + atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); return ret; } @@ -2146,6 +2160,7 @@ static void zram_reset_device(struct zram *zram) zram->disksize = 0; zram_destroy_comps(zram); memset(&zram->stats, 0, sizeof(zram->stats)); + atomic_set(&zram->pp_in_progress, 0); reset_bdev(zram); comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); @@ -2383,6 +2398,7 @@ static int zram_add(void) zram->disk->fops = &zram_devops; zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); + atomic_set(&zram->pp_in_progress, 0); /* Actual capacity set using sysfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 914cb66299694..73a9d47d76bae 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -140,5 +140,6 @@ struct zram { #ifdef CONFIG_ZRAM_MEMORY_TRACKING struct dentry *debugfs_dir; #endif + atomic_t pp_in_progress; }; #endif From 3f909a60cec19509f6bfa01f90ad878e410cec51 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 17 Sep 2024 11:09:08 +0900 Subject: [PATCH 008/215] zram: rework recompress target selection strategy Target slot selection for recompression is just a simple iteration over zram->table entries (stored pages) from slot 0 to max slot. 
Given that zram->table slots are written in random order and are not sorted by size, a simple iteration over slots selects suboptimal targets for recompression. This is not a problem if we recompress every single zram->table slot, but we never do that in reality. In reality we limit the number of slots we can recompress (via max_pages parameter) and hence proper slot selection becomes very important. The strategy is quite simple: suppose we have two candidate slots for recompression, one of size 48 bytes and one of size 2800 bytes, and we can recompress only one, then it certainly makes more sense to pick the 2800 bytes entry for recompression, because even if we manage to compress 48 bytes objects even further the savings are going to be very small. Potential savings after good re-compression of 2800 bytes objects are much higher. This patch reworks slot selection and introduces the strategy described above: among candidate slots always select the biggest ones first. For that the patch introduces zram_pp_ctl (post-processing) structure which holds NUM_PP_BUCKETS pp buckets of slots. Slots are assigned to a particular group based on their sizes - the larger the size of the slot the higher the group index. This, basically, sorts slots by size in linear time (we still perform just one iteration over zram->table slots). When we select a slot for recompression we always first look up in higher pp buckets (those that hold the largest slots), which achieves the desired behavior. TEST ==== A very simple demonstration: zram is configured with zstd, and zstd with dict as a recompression stream. A limited (max 4096 pages) recompression is performed then, with a log of sizes of slots that were recompressed. You can see that patched zram selects slots for recompression in a significantly different manner, which leads to higher memory savings (see column #2 of mm_stat output). BASE ---- *** initial state of zram device /sys/block/zram0/mm_stat 1750994944 504491413 514203648 0 514203648 1 0 34204 34204 *** recompress idle max_pages=4096 /sys/block/zram0/mm_stat 1750994944 504262229 514953216 0 514203648 1 0 34204 34204 Sizes of selected objects for recompression: ... 45 58 24 226 91 40 24 24 24 424 2104 93 2078 2078 2078 959 154 ... PATCHED ------- *** initial state of zram device /sys/block/zram0/mm_stat 1750982656 504492801 514170880 0 514170880 1 0 34204 34204 *** recompress idle max_pages=4096 /sys/block/zram0/mm_stat 1750982656 503716710 517586944 0 514170880 1 0 34204 34204 Sizes of selected objects for recompression: ... 3680 3694 3667 3590 3614 3553 3537 3548 3550 3542 3543 3537 ... Note, pp-slots are not strictly sorted, there is a PP_BUCKET_SIZE_RANGE variation of sizes within particular bucket.
[senozhatsky@chromium.org: do not skip the first bucket] Link: https://lkml.kernel.org/r/20241001085634.1948384-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20240917021020.883356-4-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Dan Carpenter Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 187 +++++++++++++++++++++++++++++----- 1 file changed, 160 insertions(+), 27 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 37a284f709ba9..f57ffb9201667 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -184,6 +184,99 @@ static void zram_accessed(struct zram *zram, u32 index) #endif } +#ifdef CONFIG_ZRAM_MULTI_COMP +struct zram_pp_slot { + unsigned long index; + struct list_head entry; +}; + +/* + * A post-processing bucket is, essentially, a size class, this defines + * the range (in bytes) of pp-slots sizes in particular bucket. + */ +#define PP_BUCKET_SIZE_RANGE 64 +#define NUM_PP_BUCKETS ((PAGE_SIZE / PP_BUCKET_SIZE_RANGE) + 1) + +struct zram_pp_ctl { + struct list_head pp_buckets[NUM_PP_BUCKETS]; +}; + +static struct zram_pp_ctl *init_pp_ctl(void) +{ + struct zram_pp_ctl *ctl; + u32 idx; + + ctl = kmalloc(sizeof(*ctl), GFP_KERNEL); + if (!ctl) + return NULL; + + for (idx = 0; idx < NUM_PP_BUCKETS; idx++) + INIT_LIST_HEAD(&ctl->pp_buckets[idx]); + return ctl; +} + +static void release_pp_slot(struct zram *zram, struct zram_pp_slot *pps) +{ + list_del_init(&pps->entry); + + zram_slot_lock(zram, pps->index); + zram_clear_flag(zram, pps->index, ZRAM_PP_SLOT); + zram_slot_unlock(zram, pps->index); + + kfree(pps); +} + +static void release_pp_ctl(struct zram *zram, struct zram_pp_ctl *ctl) +{ + u32 idx; + + if (!ctl) + return; + + for (idx = 0; idx < NUM_PP_BUCKETS; idx++) { + while (!list_empty(&ctl->pp_buckets[idx])) { + struct zram_pp_slot *pps; + + pps = list_first_entry(&ctl->pp_buckets[idx], + struct zram_pp_slot, + entry); + release_pp_slot(zram, pps); + } + } + + kfree(ctl); +} + +static void place_pp_slot(struct zram *zram, struct zram_pp_ctl *ctl, + struct zram_pp_slot *pps) +{ + u32 idx; + + idx = zram_get_obj_size(zram, pps->index) / PP_BUCKET_SIZE_RANGE; + list_add(&pps->entry, &ctl->pp_buckets[idx]); + + zram_set_flag(zram, pps->index, ZRAM_PP_SLOT); +} + +static struct zram_pp_slot *select_pp_slot(struct zram_pp_ctl *ctl) +{ + struct zram_pp_slot *pps = NULL; + s32 idx = NUM_PP_BUCKETS - 1; + + /* The higher the bucket id the more optimal slot post-processing is */ + while (idx >= 0) { + pps = list_first_entry_or_null(&ctl->pp_buckets[idx], + struct zram_pp_slot, + entry); + if (pps) + break; + + idx--; + } + return pps; +} +#endif + static inline void update_used_max(struct zram *zram, const unsigned long pages) { @@ -1657,6 +1750,52 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, } #ifdef CONFIG_ZRAM_MULTI_COMP +#define RECOMPRESS_IDLE (1 << 0) +#define RECOMPRESS_HUGE (1 << 1) + +static int scan_slots_for_recompress(struct zram *zram, u32 mode, + struct zram_pp_ctl *ctl) +{ + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + struct zram_pp_slot *pps = NULL; + unsigned long index; + + for (index = 0; index < nr_pages; index++) { + if (!pps) + pps = kmalloc(sizeof(*pps), GFP_KERNEL); + if (!pps) + return -ENOMEM; + + INIT_LIST_HEAD(&pps->entry); + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + if (mode & RECOMPRESS_IDLE && + !zram_test_flag(zram, index, ZRAM_IDLE)) + goto next; + + if 
(mode & RECOMPRESS_HUGE && + !zram_test_flag(zram, index, ZRAM_HUGE)) + goto next; + + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_SAME) || + zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + goto next; + + pps->index = index; + place_pp_slot(zram, ctl, pps); + pps = NULL; +next: + zram_slot_unlock(zram, index); + } + + kfree(pps); + return 0; +} + /* * This function will decompress (unless it's ZRAM_HUGE) the page and then * attempt to compress it using provided compression algorithm priority @@ -1664,7 +1803,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, * * Corresponding ZRAM slot should be locked. */ -static int zram_recompress(struct zram *zram, u32 index, struct page *page, +static int recompress_slot(struct zram *zram, u32 index, struct page *page, u64 *num_recomp_pages, u32 threshold, u32 prio, u32 prio_max) { @@ -1807,20 +1946,17 @@ static int zram_recompress(struct zram *zram, u32 index, struct page *page, return 0; } -#define RECOMPRESS_IDLE (1 << 0) -#define RECOMPRESS_HUGE (1 << 1) - static ssize_t recompress_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { u32 prio = ZRAM_SECONDARY_COMP, prio_max = ZRAM_MAX_COMPS; struct zram *zram = dev_to_zram(dev); - unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; char *args, *param, *val, *algo = NULL; u64 num_recomp_pages = ULLONG_MAX; + struct zram_pp_ctl *ctl = NULL; + struct zram_pp_slot *pps; u32 mode = 0, threshold = 0; - unsigned long index; struct page *page; ssize_t ret; @@ -1922,36 +2058,32 @@ static ssize_t recompress_store(struct device *dev, goto release_init_lock; } + ctl = init_pp_ctl(); + if (!ctl) { + ret = -ENOMEM; + goto release_init_lock; + } + + scan_slots_for_recompress(zram, mode, ctl); + ret = len; - for (index = 0; index < nr_pages; index++) { + while ((pps = select_pp_slot(ctl))) { int err = 0; if (!num_recomp_pages) break; - zram_slot_lock(zram, index); - - if (!zram_allocated(zram, index)) - goto next; - - if (mode & RECOMPRESS_IDLE && - !zram_test_flag(zram, index, ZRAM_IDLE)) + zram_slot_lock(zram, pps->index); + if (!zram_test_flag(zram, pps->index, ZRAM_PP_SLOT)) goto next; - if (mode & RECOMPRESS_HUGE && - !zram_test_flag(zram, index, ZRAM_HUGE)) - goto next; - - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_UNDER_WB) || - zram_test_flag(zram, index, ZRAM_SAME) || - zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) - goto next; - - err = zram_recompress(zram, index, page, &num_recomp_pages, - threshold, prio, prio_max); + err = recompress_slot(zram, pps->index, page, + &num_recomp_pages, threshold, + prio, prio_max); next: - zram_slot_unlock(zram, index); + zram_slot_unlock(zram, pps->index); + release_pp_slot(zram, pps); + if (err) { ret = err; break; @@ -1963,6 +2095,7 @@ static ssize_t recompress_store(struct device *dev, __free_page(page); release_init_lock: + release_pp_ctl(zram, ctl); atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); return ret; From 330edc2bc059a48b1f61a704521818d4f831767c Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 17 Sep 2024 11:09:09 +0900 Subject: [PATCH 009/215] zram: rework writeback target selection strategy Writeback suffers from the same problem as recompression did before - target slot selection for writeback is just a simple iteration over zram->table entries (stored pages) which selects suboptimal targets for writeback. 
This is especially problematic for writeback, because we uncompress objects before writeback so each of them takes 4K out of limited writeback storage. For example, when we take a 48 bytes slot and store it as a 4K object to writeback device we only save 48 bytes of memory (release from zsmalloc pool). We naturally want to pick the largest objects for writeback, because then each writeback will release the largest amount of memory. This patch applies the same solution and strategy as for recompression target selection: pp control (post-process) with 16 buckets of candidate pp slots. Slots are assigned to pp buckets based on sizes - the larger the slot the higher the group index. This gives us sorted by size lists of candidate slots (in linear time), so that among post-processing candidate slots we always select the largest ones first and maximize the memory saving. TEST ==== A very simple demonstration: zram is configured with a writeback device. A limited writeback (wb_limit 2500 pages) is performed then, with a log of sizes of slots that were written back. You can see that patched zram selects slots for recompression in significantly different manner, which leads to higher memory savings (see column #2 of mm_stat output). BASE ---- *** initial state of zram device /sys/block/zram0/mm_stat 1750327296 619765836 631902208 0 631902208 1 0 34278 34278 *** writeback idle wb_limit 2500 /sys/block/zram0/mm_stat 1750327296 617622333 631578624 0 631902208 1 0 34278 34278 Sizes of selected objects for writeback: ... 193 349 46 46 46 46 852 1002 543 162 107 49 34 34 34 ... PATCHED ------- *** initial state of zram device /sys/block/zram0/mm_stat 1750319104 619760957 631992320 0 631992320 1 0 34278 34278 *** writeback idle wb_limit 2500 /sys/block/zram0/mm_stat 1750319104 612672056 626135040 0 631992320 1 0 34278 34278 Sizes of selected objects for writeback: ... 3667 3580 3581 3580 3581 3581 3581 3231 3211 3203 3231 3246 ... Note, pp-slots are not strictly sorted, there is a PP_BUCKET_SIZE_RANGE variation of sizes within particular bucket. 
Link: https://lkml.kernel.org/r/20240917021020.883356-5-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 83 +++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 19 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f57ffb9201667..42f7195b80cb9 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -184,7 +184,7 @@ static void zram_accessed(struct zram *zram, u32 index) #endif } -#ifdef CONFIG_ZRAM_MULTI_COMP +#if defined CONFIG_ZRAM_WRITEBACK || defined CONFIG_ZRAM_MULTI_COMP struct zram_pp_slot { unsigned long index; struct list_head entry; @@ -681,11 +681,57 @@ static void read_from_bdev_async(struct zram *zram, struct page *page, #define IDLE_WRITEBACK (1<<1) #define INCOMPRESSIBLE_WRITEBACK (1<<2) +static int scan_slots_for_writeback(struct zram *zram, u32 mode, + unsigned long nr_pages, + unsigned long index, + struct zram_pp_ctl *ctl) +{ + struct zram_pp_slot *pps = NULL; + + for (; nr_pages != 0; index++, nr_pages--) { + if (!pps) + pps = kmalloc(sizeof(*pps), GFP_KERNEL); + if (!pps) + return -ENOMEM; + + INIT_LIST_HEAD(&pps->entry); + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + if (zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_SAME)) + goto next; + + if (mode & IDLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_IDLE)) + goto next; + if (mode & HUGE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_HUGE)) + goto next; + if (mode & INCOMPRESSIBLE_WRITEBACK && + !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + goto next; + + pps->index = index; + place_pp_slot(zram, ctl, pps); + pps = NULL; +next: + zram_slot_unlock(zram, index); + } + + kfree(pps); + return 0; +} + static ssize_t writeback_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + struct zram_pp_ctl *ctl = NULL; + struct zram_pp_slot *pps; unsigned long index = 0; struct bio bio; struct bio_vec bio_vec; @@ -737,7 +783,15 @@ static ssize_t writeback_store(struct device *dev, goto release_init_lock; } - for (; nr_pages != 0; index++, nr_pages--) { + ctl = init_pp_ctl(); + if (!ctl) { + ret = -ENOMEM; + goto release_init_lock; + } + + scan_slots_for_writeback(zram, mode, nr_pages, index, ctl); + + while ((pps = select_pp_slot(ctl))) { spin_lock(&zram->wb_limit_lock); if (zram->wb_limit_enable && !zram->bd_wb_limit) { spin_unlock(&zram->wb_limit_lock); @@ -754,25 +808,10 @@ static ssize_t writeback_store(struct device *dev, } } + index = pps->index; zram_slot_lock(zram, index); - if (!zram_allocated(zram, index)) - goto next; - - if (zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_SAME) || - zram_test_flag(zram, index, ZRAM_UNDER_WB)) - goto next; - - if (mode & IDLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_IDLE)) - goto next; - if (mode & HUGE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_HUGE)) - goto next; - if (mode & INCOMPRESSIBLE_WRITEBACK && - !zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) + if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) goto next; - /* * Clearing ZRAM_UNDER_WB is duty of caller. * IOW, zram_free_page never clear it. 
@@ -786,6 +825,8 @@ static ssize_t writeback_store(struct device *dev, zram_clear_flag(zram, index, ZRAM_UNDER_WB); zram_clear_flag(zram, index, ZRAM_IDLE); zram_slot_unlock(zram, index); + + release_pp_slot(zram, pps); continue; } @@ -804,6 +845,8 @@ static ssize_t writeback_store(struct device *dev, zram_clear_flag(zram, index, ZRAM_UNDER_WB); zram_clear_flag(zram, index, ZRAM_IDLE); zram_slot_unlock(zram, index); + + release_pp_slot(zram, pps); /* * BIO errors are not fatal, we continue and simply * attempt to writeback the remaining objects (pages). @@ -846,12 +889,14 @@ static ssize_t writeback_store(struct device *dev, spin_unlock(&zram->wb_limit_lock); next: zram_slot_unlock(zram, index); + release_pp_slot(zram, pps); } if (blk_idx) free_block_bdev(zram, blk_idx); __free_page(page); release_init_lock: + release_pp_ctl(zram, ctl); atomic_set(&zram->pp_in_progress, 0); up_read(&zram->init_lock); From b967fa1ba72b5da2b6d9bf95f0b13420a59e0701 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 17 Sep 2024 11:09:10 +0900 Subject: [PATCH 010/215] zram: do not mark idle slots that cannot be idle ZRAM_SAME slots cannot be post-processed (writeback or recompress) so do not mark them ZRAM_IDLE. Same with ZRAM_WB slots, they cannot be ZRAM_IDLE because they are not in zsmalloc pool anymore. Link: https://lkml.kernel.org/r/20240917021020.883356-6-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 42f7195b80cb9..41e4086619405 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -392,17 +392,28 @@ static void mark_idle(struct zram *zram, ktime_t cutoff) /* * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race. * See the comment in writeback_store. + * + * Also do not mark ZRAM_SAME slots as ZRAM_IDLE, because no + * post-processing (recompress, writeback) happens to the + * ZRAM_SAME slot. + * + * And ZRAM_WB slots simply cannot be ZRAM_IDLE. */ zram_slot_lock(zram, index); - if (zram_allocated(zram, index) && - !zram_test_flag(zram, index, ZRAM_UNDER_WB)) { + if (!zram_allocated(zram, index) || + zram_test_flag(zram, index, ZRAM_WB) || + zram_test_flag(zram, index, ZRAM_UNDER_WB) || + zram_test_flag(zram, index, ZRAM_SAME)) { + zram_slot_unlock(zram, index); + continue; + } + #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME - is_idle = !cutoff || ktime_after(cutoff, - zram->table[index].ac_time); + is_idle = !cutoff || + ktime_after(cutoff, zram->table[index].ac_time); #endif - if (is_idle) - zram_set_flag(zram, index, ZRAM_IDLE); - } + if (is_idle) + zram_set_flag(zram, index, ZRAM_IDLE); zram_slot_unlock(zram, index); } } From 1a1d0f8992d5c6c8059d28cd9cb263180dd98a28 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 17 Sep 2024 11:09:11 +0900 Subject: [PATCH 011/215] zram: reshuffle zram_free_page() flags operations Drop some redundant zram_test_flag() calls and re-order zram_clear_flag() calls. Plus two small trivial coding style fixes. No functional changes. 
Link: https://lkml.kernel.org/r/20240917021020.883356-7-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 41e4086619405..c59a3e9218a9c 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1499,20 +1499,17 @@ static void zram_free_page(struct zram *zram, size_t index) #ifdef CONFIG_ZRAM_TRACK_ENTRY_ACTIME zram->table[index].ac_time = 0; #endif - if (zram_test_flag(zram, index, ZRAM_IDLE)) - zram_clear_flag(zram, index, ZRAM_IDLE); + + zram_clear_flag(zram, index, ZRAM_IDLE); + zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE); + zram_clear_flag(zram, index, ZRAM_PP_SLOT); + zram_set_priority(zram, index, 0); if (zram_test_flag(zram, index, ZRAM_HUGE)) { zram_clear_flag(zram, index, ZRAM_HUGE); atomic64_dec(&zram->stats.huge_pages); } - if (zram_test_flag(zram, index, ZRAM_INCOMPRESSIBLE)) - zram_clear_flag(zram, index, ZRAM_INCOMPRESSIBLE); - - zram_set_priority(zram, index, 0); - zram_clear_flag(zram, index, ZRAM_PP_SLOT); - if (zram_test_flag(zram, index, ZRAM_WB)) { zram_clear_flag(zram, index, ZRAM_WB); free_block_bdev(zram, zram_get_element(zram, index)); @@ -1536,13 +1533,12 @@ static void zram_free_page(struct zram *zram, size_t index) zs_free(zram->mem_pool, handle); atomic64_sub(zram_get_obj_size(zram, index), - &zram->stats.compr_data_size); + &zram->stats.compr_data_size); out: atomic64_dec(&zram->stats.pages_stored); zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); - WARN_ON_ONCE(zram->table[index].flags & - ~(1UL << ZRAM_UNDER_WB)); + WARN_ON_ONCE(zram->table[index].flags & ~(1UL << ZRAM_UNDER_WB)); } /* From 5e99893444a0e0582feb49d618195114b6e35760 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 17 Sep 2024 11:09:12 +0900 Subject: [PATCH 012/215] zram: remove UNDER_WB and simplify writeback We now have only one active post-processing at any time, so we don't have same race conditions that we had before. If slot selected for post-processing gets freed or freed and reallocated it loses its PP_SLOT flag and there is no way for such a slot to gain PP_SLOT flag again until current post-processing terminates. Link: https://lkml.kernel.org/r/20240917021020.883356-8-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 53 +++++++++++------------------------ drivers/block/zram/zram_drv.h | 1 - 2 files changed, 16 insertions(+), 38 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index c59a3e9218a9c..263795c4aef70 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -390,10 +390,7 @@ static void mark_idle(struct zram *zram, ktime_t cutoff) for (index = 0; index < nr_pages; index++) { /* - * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race. - * See the comment in writeback_store. - * - * Also do not mark ZRAM_SAME slots as ZRAM_IDLE, because no + * Do not mark ZRAM_SAME slots as ZRAM_IDLE, because no * post-processing (recompress, writeback) happens to the * ZRAM_SAME slot. 
* @@ -402,7 +399,6 @@ static void mark_idle(struct zram *zram, ktime_t cutoff) zram_slot_lock(zram, index); if (!zram_allocated(zram, index) || zram_test_flag(zram, index, ZRAM_WB) || - zram_test_flag(zram, index, ZRAM_UNDER_WB) || zram_test_flag(zram, index, ZRAM_SAME)) { zram_slot_unlock(zram, index); continue; @@ -821,22 +817,17 @@ static ssize_t writeback_store(struct device *dev, index = pps->index; zram_slot_lock(zram, index); - if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) - goto next; /* - * Clearing ZRAM_UNDER_WB is duty of caller. - * IOW, zram_free_page never clear it. + * scan_slots() sets ZRAM_PP_SLOT and relases slot lock, so + * slots can change in the meantime. If slots are accessed or + * freed they lose ZRAM_PP_SLOT flag and hence we don't + * post-process them. */ - zram_set_flag(zram, index, ZRAM_UNDER_WB); - /* Need for hugepage writeback racing */ - zram_set_flag(zram, index, ZRAM_IDLE); + if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) + goto next; zram_slot_unlock(zram, index); - if (zram_read_page(zram, page, index, NULL)) { - zram_slot_lock(zram, index); - zram_clear_flag(zram, index, ZRAM_UNDER_WB); - zram_clear_flag(zram, index, ZRAM_IDLE); - zram_slot_unlock(zram, index); + if (zram_read_page(zram, page, index, NULL)) { release_pp_slot(zram, pps); continue; } @@ -852,11 +843,6 @@ static ssize_t writeback_store(struct device *dev, */ err = submit_bio_wait(&bio); if (err) { - zram_slot_lock(zram, index); - zram_clear_flag(zram, index, ZRAM_UNDER_WB); - zram_clear_flag(zram, index, ZRAM_IDLE); - zram_slot_unlock(zram, index); - release_pp_slot(zram, pps); /* * BIO errors are not fatal, we continue and simply @@ -871,25 +857,19 @@ static ssize_t writeback_store(struct device *dev, } atomic64_inc(&zram->stats.bd_writes); + zram_slot_lock(zram, index); /* - * We released zram_slot_lock so need to check if the slot was - * changed. If there is freeing for the slot, we can catch it - * easily by zram_allocated. - * A subtle case is the slot is freed/reallocated/marked as - * ZRAM_IDLE again. To close the race, idle_store doesn't - * mark ZRAM_IDLE once it found the slot was ZRAM_UNDER_WB. - * Thus, we could close the race by checking ZRAM_IDLE bit. + * Same as above, we release slot lock during writeback so + * slot can change under us: slot_free() or slot_free() and + * reallocation (zram_write_page()). In both cases slot loses + * ZRAM_PP_SLOT flag. No concurrent post-processing can set + * ZRAM_PP_SLOT on such slots until current post-processing + * finishes. 
*/ - zram_slot_lock(zram, index); - if (!zram_allocated(zram, index) || - !zram_test_flag(zram, index, ZRAM_IDLE)) { - zram_clear_flag(zram, index, ZRAM_UNDER_WB); - zram_clear_flag(zram, index, ZRAM_IDLE); + if (!zram_test_flag(zram, index, ZRAM_PP_SLOT)) goto next; - } zram_free_page(zram, index); - zram_clear_flag(zram, index, ZRAM_UNDER_WB); zram_set_flag(zram, index, ZRAM_WB); zram_set_element(zram, index, blk_idx); blk_idx = 0; @@ -1538,7 +1518,6 @@ static void zram_free_page(struct zram *zram, size_t index) atomic64_dec(&zram->stats.pages_stored); zram_set_handle(zram, index, 0); zram_set_obj_size(zram, index, 0); - WARN_ON_ONCE(zram->table[index].flags & ~(1UL << ZRAM_UNDER_WB)); } /* diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 73a9d47d76bae..134be414e2106 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -47,7 +47,6 @@ enum zram_pageflags { ZRAM_SAME = ZRAM_FLAG_SHIFT, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ - ZRAM_UNDER_WB, /* page is under writeback */ ZRAM_PP_SLOT, /* Selected for post-processing */ ZRAM_HUGE, /* Incompressible page */ ZRAM_IDLE, /* not accessed page since last idle marking */ From cd3f8467afd470ccab0de2fbc7c76664af4a0bac Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Tue, 24 Sep 2024 21:10:23 +0100 Subject: [PATCH 013/215] mm: refactor mm_access() to not return NULL mm_access() can return NULL if the mm is not found, but this is handled the same as an error in all callers, with some translating this into an -ESRCH error. Only proc_mem_open() returns NULL if no mm is found, however in this case it is clearer and makes more sense to explicitly handle the error. Additionally we take the opportunity to refactor the function to eliminate unnecessary nesting. Simplify things by simply returning -ESRCH if no mm is found - this both eliminates confusing use of the IS_ERR_OR_NULL() macro, and simplifies callers which would return -ESRCH by returning this error directly. [lorenzo.stoakes@oracle.com: prefer neater pointer error comparison] Link: https://lkml.kernel.org/r/2fae1834-749a-45e1-8594-5e5979cf7103@lucifer.local Link: https://lkml.kernel.org/r/20240924201023.193135-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Suggested-by: Arnd Bergmann Cc: Al Viro Signed-off-by: Andrew Morton --- fs/proc/base.c | 26 ++++++++++++++------------ kernel/fork.c | 5 +++-- mm/madvise.c | 4 ++-- mm/process_vm_access.c | 4 ++-- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index b31283d81c52e..94112df5f2a24 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -832,19 +832,21 @@ static const struct file_operations proc_single_file_operations = { struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); - struct mm_struct *mm = ERR_PTR(-ESRCH); + struct mm_struct *mm; - if (task) { - mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); - put_task_struct(task); + if (!task) + return ERR_PTR(-ESRCH); - if (!IS_ERR_OR_NULL(mm)) { - /* ensure this mm_struct can't be freed */ - mmgrab(mm); - /* but do not pin its memory */ - mmput(mm); - } - } + mm = mm_access(task, mode | PTRACE_MODE_FSCREDS); + put_task_struct(task); + + if (IS_ERR(mm)) + return mm == ERR_PTR(-ESRCH) ? 
NULL : mm; + + /* ensure this mm_struct can't be freed */ + mmgrab(mm); + /* but do not pin its memory */ + mmput(mm); return mm; } @@ -2208,7 +2210,7 @@ static int map_files_d_revalidate(struct dentry *dentry, unsigned int flags) goto out_notask; mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); - if (IS_ERR_OR_NULL(mm)) + if (IS_ERR(mm)) goto out; if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { diff --git a/kernel/fork.c b/kernel/fork.c index 22f43721d031d..b2ab422f62309 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1546,8 +1546,9 @@ struct mm_struct *mm_access(struct task_struct *task, unsigned int mode) return ERR_PTR(err); mm = get_task_mm(task); - if (mm && mm != current->mm && - !ptrace_may_access(task, mode)) { + if (!mm) { + mm = ERR_PTR(-ESRCH); + } else if (mm != current->mm && !ptrace_may_access(task, mode)) { mmput(mm); mm = ERR_PTR(-EACCES); } diff --git a/mm/madvise.c b/mm/madvise.c index ff139e57cca29..50d223ab3894f 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1511,8 +1511,8 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); - if (IS_ERR_OR_NULL(mm)) { - ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + if (IS_ERR(mm)) { + ret = PTR_ERR(mm); goto release_task; } diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index b308e96cd05a2..656d3e88755b6 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -201,8 +201,8 @@ static ssize_t process_vm_rw_core(pid_t pid, struct iov_iter *iter, } mm = mm_access(task, PTRACE_MODE_ATTACH_REALCREDS); - if (!mm || IS_ERR(mm)) { - rc = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + if (IS_ERR(mm)) { + rc = PTR_ERR(mm); /* * Explicitly map EACCES to EPERM as EPERM is a more * appropriate error code for process_vw_readv/writev From 8c7904a8cd0dfb061d078545c2d3c4acce1fcfeb Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 11 Sep 2024 14:27:58 +0000 Subject: [PATCH 014/215] maple_tree: i is always less than or equal to mas_end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "refine mas_mab_cp()". By analysis of the code, one condition check can be removed and one case would hit a redundant assignment. This patch (of 2): mas_mab_cp() copy range [mas_start, mas_end] inclusively from a maple_node to maple_big_node. This implies mas_start <= mas_end. Based on the relationship of mas_start and mas_end, we can have the following four cases: | mas_start == mas_end | mas_start < mas_end ---------------+----------------------+---------------------- mas_start == 0 | 1 | 2 ---------------+----------------------+---------------------- mas_start != 0 | 3 | 4 We can see in all these four cases, i is always less than or equal to mas_end after finish the loop: Case 1: After assign pivot 0, i is set to 1, which is bigger than mas_end 0. So it jumps to complete and skip the check. Case 2: After assign pivot 0, i is set to 1. ∵ (mas_start < mas_end) && (mas_start == 0) ==> (1 <= mas_end) ∵ (i == 1) && (1 <= mas_end) ==> (i <= mas_end) ∴ Before loop, we have (i <= mas_end). And we still hold this if it skips the loop. For example, (i == mas_end). 
Now let's see what happens in the loop: ∵ piv_end = min(mas_end, mt_pivots[mt]) ==> (piv_end <= mas_end) ∵ loop condition is (i < piv_end) ==> (i <= piv_end) on finish the loop both normally or break ∵ (i <= piv_end) && (piv_end <= mas_end) ==> (i <= mas_end) ∴ After loop, we still get (i <= mas_end) in this case Case 3: This case would skip both if clause and loop. So when it comes to the check, i is still mas_start which equals to mas_end. Case 4: This case would skip the if clause. ∵ (mas_start < mas_end) && (i == mas_start) ==> (i < mas_end) ∴ Before loop, we have (i < mas_end). The loop process is similar with Case 2, so we get the same result. Now we can conclude in all cases, we get (i <= mas_end) when doing check. Then it is not necessary to do the check. Link: https://lkml.kernel.org/r/20240911142759.20989-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20240911142759.20989-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 3619301dda2eb..55958cbcc3faf 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1949,8 +1949,7 @@ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, goto complete; } - if (likely(i <= mas_end)) - b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); + b_node->pivot[j] = mas_safe_pivot(mas, pivots, i, mt); complete: b_node->b_end = ++j; From 1c148069b240a3a65d1aee90c9d5c6997a747a7d Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 11 Sep 2024 14:27:59 +0000 Subject: [PATCH 015/215] maple_tree: goto complete directly on a pivot of 0 When we break the loop after assigning a pivot, the index i/j is not changed. Then the following code assign pivot, which means we do the assignment with same i/j by mas_safe_pivot. Since the loop condition is (i < piv_end), from which we can get i is less than mt_pivots[mt]. It implies mas_safe_pivot() return pivot[i] which is the same value we get in loop. Now we can conclude it does a redundant assignment on a pivot of 0. Let's just go to complete to avoid it. Link: https://lkml.kernel.org/r/20240911142759.20989-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 55958cbcc3faf..de883bfb97ef7 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1943,7 +1943,7 @@ static inline void mas_mab_cp(struct ma_state *mas, unsigned char mas_start, for (; i < piv_end; i++, j++) { b_node->pivot[j] = pivots[i]; if (unlikely(!b_node->pivot[j])) - break; + goto complete; if (unlikely(mas->max == b_node->pivot[j])) goto complete; From f36ba810816182953af74d176e0644e38979b723 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 8 Sep 2024 14:05:53 +0000 Subject: [PATCH 016/215] maple_tree: remove maple_big_node.parent Patch series "Reduce the space to be cleared for maple_big_node", v2. Found current code may clear maple_big_node redundantly. First we define a field parent, which is never used. After removing this, we reduce the size of memory to be cleared by memset. Then mast_fill_bnode() clears part of the structure twice, since slot and gap share some space. By clearing the whole structure, we can avoid this. This patch (of 2): The member parent of maple_big_node is never used. 
Let's remove it, which reduces the amount of space to be cleared by memset. Link: https://lkml.kernel.org/r/20240908140554.20378-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20240908140554.20378-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 1 - 1 file changed, 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index de883bfb97ef7..04cd5ce2a33ce 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -120,7 +120,6 @@ static const unsigned char mt_min_slots[] = { #define MAPLE_BIG_NODE_GAPS (MAPLE_ARANGE64_SLOTS * 2 + 1) struct maple_big_node { - struct maple_pnode *parent; unsigned long pivot[MAPLE_BIG_NODE_SLOTS - 1]; union { struct maple_enode *slot[MAPLE_BIG_NODE_SLOTS]; From 5059aa6334fcf4b7ddd672255aec5835aecd32b6 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sun, 8 Sep 2024 14:05:54 +0000 Subject: [PATCH 017/215] maple_tree: memset maple_big_node as a whole In mast_fill_bnode(), we first clear some fields of maple_big_node and set the 'type' unconditionally before returning. This means we won't leverage any information in maple_big_node, and it is safe to clear the whole structure. In maple_big_node, we define slot and padding/gap in a union. Based on the current definitions of MAPLE_BIG_NODE_SLOTS/GAPS, padding is always smaller than slot and part of the gap is overlapped by slot. For example, on a 64-bit system: MAPLE_BIG_NODE_SLOTS is 34 MAPLE_BIG_NODE_GAPS is 21 With this knowledge, the current code may clear some space twice. This can be avoided by clearing the structure as a whole. Link: https://lkml.kernel.org/r/20240908140554.20378-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 04cd5ce2a33ce..c5987244ff636 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3157,10 +3157,7 @@ static inline void mast_fill_bnode(struct maple_subtree_state *mast, bool cp = true; unsigned char split; - memset(mast->bn->gap, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->gap)); - memset(mast->bn->slot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->slot)); - memset(mast->bn->pivot, 0, sizeof(unsigned long) * ARRAY_SIZE(mast->bn->pivot)); - mast->bn->b_end = 0; + memset(mast->bn, 0, sizeof(struct maple_big_node)); if (mte_is_root(mas->node)) { cp = false; From bbc251f30ef312343fec3f5c0591ce01078c2bb9 Mon Sep 17 00:00:00 2001 From: Zhiguo Jiang Date: Fri, 12 Jan 2024 09:23:52 +0800 Subject: [PATCH 018/215] mm: fix shrink nr.unqueued_dirty counter issue We need to ensure sc->nr.unqueued_dirty > 0, which avoids setting the PGDAT_DIRTY flag when sc->nr.unqueued_dirty and sc->nr.file_taken are both zero.
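To make the corner case concrete, here is a small user-space sketch of the predicate (illustration only, not kernel code; the helper name is invented):

/*
 * Illustration only: with no file pages taken and no unqueued dirty pages,
 * the old check "unqueued_dirty == file_taken" is trivially true (0 == 0)
 * and would set PGDAT_DIRTY for no reason. Requiring a non-zero
 * unqueued_dirty count closes that hole.
 */
#include <stdbool.h>
#include <stdio.h>

static bool would_set_pgdat_dirty(unsigned long unqueued_dirty,
				  unsigned long file_taken)
{
	return unqueued_dirty && unqueued_dirty == file_taken;
}

int main(void)
{
	printf("%d\n", would_set_pgdat_dirty(0, 0));	/* 0: no longer triggers */
	printf("%d\n", would_set_pgdat_dirty(32, 32));	/* 1: every taken file page was dirty */
	return 0;
}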
Link: https://lkml.kernel.org/r/20240112012353.1387-1-justinjiang@vivo.com Signed-off-by: Zhiguo Jiang Signed-off-by: Andrew Morton --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 28ba2b06fc7dc..20dd72c98813d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5990,7 +5990,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) set_bit(PGDAT_WRITEBACK, &pgdat->flags); /* Allow kswapd to start writing pages during reclaim.*/ - if (sc->nr.unqueued_dirty == sc->nr.file_taken) + if (sc->nr.unqueued_dirty && + sc->nr.unqueued_dirty == sc->nr.file_taken) set_bit(PGDAT_DIRTY, &pgdat->flags); /* From 1cd1a4e71b61eaf8cadd15372b67ccd60a2e1a99 Mon Sep 17 00:00:00 2001 From: Tanya Agarwal Date: Fri, 27 Sep 2024 00:05:16 +0530 Subject: [PATCH 019/215] mm/mempolicy: fix comments for better documentation Fix a typo in mempolicy.h and correct the documented number of allowed memory policies. Link: https://lkml.kernel.org/r/20240926183516.4034-2-tanyaagarwal25699@gmail.com Signed-off-by: Tanya Agarwal Reviewed-by: Shuah Khan Cc: Anup Sharma Signed-off-by: Andrew Morton --- include/linux/mempolicy.h | 2 +- mm/mempolicy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 1add16f216124..ce9885e0178ad 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -47,7 +47,7 @@ struct mempolicy { atomic_t refcnt; unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ - nodemask_t nodes; /* interleave/bind/perfer */ + nodemask_t nodes; /* interleave/bind/preferred/etc */ int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b646fab3e45e1..9e18a6fc30617 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -8,7 +8,7 @@ * NUMA policy allows the user to give hints in which node(s) memory should * be allocated. * - * Support four policies per VMA and per process: + * Support six policies per VMA and per process: * * The VMA policy has priority over the process policy for a page fault. * From 3b2faed068b9e736402f0b6f98fd68a177f619ec Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 26 Sep 2024 17:20:43 +0200 Subject: [PATCH 020/215] selftests/mm: hugetlb_fault_after_madv: use default hugetlb page size Patch series "selftests/mm: hugetlb_fault_after_madv improvements". Mario brought to my attention that the hugetlb_fault_after_madv test is currently always skipped on s390x. Let's adjust the test to be independent of the default hugetlb page size and, while at it, also improve the test output. This patch (of 2): We currently assume that the hugetlb page size is 2 MiB, which is why we mmap() a 2 MiB range. If the default hugetlb size is larger, mmap() will fail because the range is not suitable. If the default hugetlb size is smaller (e.g., s390x), mmap() will fail because we would need more than one hugetlb page, but we just asserted that we have exactly one. So let's simply use the default hugetlb page size instead of the hard-coded 2 MiB, so the test isn't unconditionally skipped on architectures like s390x. Before this patch on s390x: $ ./hugetlb_fault_after_madv 1..0 # SKIP Failed to allocated huge page With this change on s390x: $ ./hugetlb_fault_after_madv While at it, make "huge_ptr" static.
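For illustration, the approach can be sketched as a standalone user-space program (not the selftest itself; it reads Hugepagesize from /proc/meminfo much like the selftests' default_huge_page_size() helper, and assumes at least one hugetlb page is reserved):

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

/* Parse the default hugetlb page size (in bytes) from /proc/meminfo. */
static size_t default_hugepage_size(void)
{
	char line[128];
	size_t kb = 0;
	FILE *f = fopen("/proc/meminfo", "r");

	if (!f)
		return 0;
	while (fgets(line, sizeof(line), f)) {
		if (sscanf(line, "Hugepagesize: %zu kB", &kb) == 1)
			break;
	}
	fclose(f);
	return kb * 1024;
}

int main(void)
{
	size_t size = default_hugepage_size();
	char *ptr;

	if (!size)
		return 1;
	/* Map exactly one default-sized hugetlb page instead of assuming 2 MiB. */
	ptr = mmap(NULL, size, PROT_READ | PROT_WRITE,
		   MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
	if (ptr == MAP_FAILED)
		return 1;
	ptr[0] = '.';
	munmap(ptr, size);
	return 0;
}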
Link: https://lkml.kernel.org/r/20240926152044.2205129-1-david@redhat.com Link: https://lkml.kernel.org/r/20240926152044.2205129-2-david@redhat.com Signed-off-by: David Hildenbrand Reported-by: Mario Casquero Tested-by: Mario Casquero Reviewed-by: Shuah Khan Reviewed-by: Breno Leitao Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/hugetlb_fault_after_madv.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c index 73b81c6323662..ff3ba675278d3 100644 --- a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c +++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c @@ -9,10 +9,10 @@ #include "vm_util.h" #include "../kselftest.h" -#define MMAP_SIZE (1 << 21) #define INLOOP_ITER 100 -char *huge_ptr; +static char *huge_ptr; +static size_t huge_page_size; /* Touch the memory while it is being madvised() */ void *touch(void *unused) @@ -30,7 +30,7 @@ void *madv(void *unused) usleep(rand() % 10); for (int i = 0; i < INLOOP_ITER; i++) - madvise(huge_ptr, MMAP_SIZE, MADV_DONTNEED); + madvise(huge_ptr, huge_page_size, MADV_DONTNEED); return NULL; } @@ -47,6 +47,10 @@ int main(void) srand(getpid()); + huge_page_size = default_huge_page_size(); + if (!huge_page_size) + ksft_exit_skip("Could not detect default hugetlb page size."); + free_hugepages = get_free_hugepages(); if (free_hugepages != 1) { ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n", @@ -54,7 +58,7 @@ int main(void) } while (max--) { - huge_ptr = mmap(NULL, MMAP_SIZE, PROT_READ | PROT_WRITE, + huge_ptr = mmap(NULL, huge_page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0); @@ -66,7 +70,7 @@ int main(void) pthread_join(thread1, NULL); pthread_join(thread2, NULL); - munmap(huge_ptr, MMAP_SIZE); + munmap(huge_ptr, huge_page_size); } return KSFT_PASS; From f33cea94e37ce10e27b192e3c5e80ff685ac7b1f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 26 Sep 2024 17:20:44 +0200 Subject: [PATCH 021/215] selftests/mm: hugetlb_fault_after_madv: improve test output Let's improve the test output. For example, print the proper test result. Install a SIGBUS handler to catch any SIGBUS instead of crashing the test on failure. With unsuitable hugetlb page count: $ ./hugetlb_fault_after_madv TAP version 13 1..1 # [INFO] detected default hugetlb page size: 2048 KiB ok 2 # SKIP This test needs one and only one page to execute. Got 0 # Totals: pass:0 fail:0 xfail:0 xpass:0 skip:1 error:0 On a failure: $ ./hugetlb_fault_after_madv TAP version 13 1..1 not ok 1 SIGBUS behavior Bail out! 
1 out of 1 tests failed On success: $ ./hugetlb_fault_after_madv TAP version 13 1..1 # [INFO] detected default hugetlb page size: 2048 KiB ok 1 SIGBUS behavior # Totals: pass:1 fail:0 xfail:0 xpass:0 skip:0 error:0 Link: https://lkml.kernel.org/r/20240926152044.2205129-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Breno Leitao Tested-by: Mario Casquero Cc: Shuah Khan Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/mm/hugetlb_fault_after_madv.c | 34 ++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c index ff3ba675278d3..e2640529dbb29 100644 --- a/tools/testing/selftests/mm/hugetlb_fault_after_madv.c +++ b/tools/testing/selftests/mm/hugetlb_fault_after_madv.c @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include "vm_util.h" #include "../kselftest.h" @@ -14,11 +16,25 @@ static char *huge_ptr; static size_t huge_page_size; +static sigjmp_buf sigbuf; +static bool sigbus_triggered; + +static void signal_handler(int signal) +{ + if (signal == SIGBUS) { + sigbus_triggered = true; + siglongjmp(sigbuf, 1); + } +} + /* Touch the memory while it is being madvised() */ void *touch(void *unused) { char *ptr = (char *)huge_ptr; + if (sigsetjmp(sigbuf, 1)) + return NULL; + for (int i = 0; i < INLOOP_ITER; i++) ptr[0] = '.'; @@ -44,13 +60,23 @@ int main(void) * interactions */ int max = 10000; + int err; + + ksft_print_header(); + ksft_set_plan(1); srand(getpid()); + if (signal(SIGBUS, signal_handler) == SIG_ERR) + ksft_exit_skip("Could not register signal handler."); + huge_page_size = default_huge_page_size(); if (!huge_page_size) ksft_exit_skip("Could not detect default hugetlb page size."); + ksft_print_msg("[INFO] detected default hugetlb page size: %zu KiB\n", + huge_page_size / 1024); + free_hugepages = get_free_hugepages(); if (free_hugepages != 1) { ksft_exit_skip("This test needs one and only one page to execute. Got %lu\n", @@ -73,5 +99,11 @@ int main(void) munmap(huge_ptr, huge_page_size); } - return KSFT_PASS; + ksft_test_result(!sigbus_triggered, "SIGBUS behavior\n"); + + err = ksft_get_fail_cnt(); + if (err) + ksft_exit_fail_msg("%d out of %d tests failed\n", + err, ksft_test_num()); + ksft_exit_pass(); } From 021781b01275c07cd5b7d3e4e8afc2bdf2429a84 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 26 Sep 2024 16:10:19 +0100 Subject: [PATCH 022/215] mm/madvise: unrestrict process_madvise() for current process The process_madvise() call was introduced in commit ecb8ac8b1f14 ("mm/madvise: introduce process_madvise() syscall: an external memory hinting API") as a means of performing madvise() operations on another process. However, as it provides the means by which to perform multiple madvise() operations in a batch via an iovec, it is useful to utilise the same interface for performing operations on the current process rather than a remote one. Commit 22af8caff7d1 ("mm/madvise: process_madvise() drop capability check if same mm") removed the need for a caller invoking process_madvise() on its own pidfd to possess the CAP_SYS_NICE capability, however this leaves the restrictions on operation in place. Resolve this by only applying the restriction on operations when accessing a remote process. Moving forward we plan to implement a simpler means of specifying this condition other than needing to establish a self pidfd, perhaps in the form of a sentinel pidfd. 
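For illustration, with this change a caller can batch advice on its own address space along these lines (a hypothetical user-space sketch using raw syscall numbers; it assumes a libc that defines SYS_pidfd_open and SYS_process_madvise):

#define _GNU_SOURCE
#include <string.h>
#include <sys/mman.h>
#include <sys/syscall.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *a = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	char *b = mmap(NULL, 4 * page, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct iovec vec[2] = {
		{ .iov_base = a, .iov_len = 4 * page },
		{ .iov_base = b, .iov_len = 4 * page },
	};
	int pidfd = syscall(SYS_pidfd_open, getpid(), 0);

	if (a == MAP_FAILED || b == MAP_FAILED || pidfd < 0)
		return 1;
	memset(a, 1, 4 * page);
	memset(b, 1, 4 * page);
	/*
	 * One syscall covers both ranges; MADV_DONTNEED on the caller's own
	 * mm is only accepted once the remote-only restriction is lifted.
	 */
	if (syscall(SYS_process_madvise, pidfd, vec, 2, MADV_DONTNEED, 0) < 0)
		return 1;
	return 0;
}

The same call with a pidfd for another process would still be limited to the non-destructive hints and would still require CAP_SYS_NICE.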
Also take the opportunity to refactor the system call implementation abstracting the vectorised operation. Link: https://lkml.kernel.org/r/20240926151019.82902-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Shakeel Butt Acked-by: Vlastimil Babka Cc: Arnd Bergmann Cc: Christian Brauner Cc: "Liam R. Howlett" Cc: Minchan Kim Cc: Pedro Falcato Cc: Suren Baghdasaryan Signed-off-by: Andrew Morton --- mm/madvise.c | 55 ++++++++++++++++++++++++++++++++++------------------ 1 file changed, 36 insertions(+), 19 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index 50d223ab3894f..e871a72a6c329 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1208,7 +1208,8 @@ madvise_behavior_valid(int behavior) } } -static bool process_madvise_behavior_valid(int behavior) +/* Can we invoke process_madvise() on a remote mm for the specified behavior? */ +static bool process_madvise_remote_valid(int behavior) { switch (behavior) { case MADV_COLD: @@ -1477,6 +1478,28 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) return do_madvise(current->mm, start, len_in, behavior); } +/* Perform an madvise operation over a vector of addresses and lengths. */ +static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, + int behavior) +{ + ssize_t ret = 0; + size_t total_len; + + total_len = iov_iter_count(iter); + + while (iov_iter_count(iter)) { + ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), + iter_iov_len(iter), behavior); + if (ret < 0) + break; + iov_iter_advance(iter, iter_iov_len(iter)); + } + + ret = (total_len - iov_iter_count(iter)) ? : ret; + + return ret; +} + SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, size_t, vlen, int, behavior, unsigned int, flags) { @@ -1486,7 +1509,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, struct iov_iter iter; struct task_struct *task; struct mm_struct *mm; - size_t total_len; unsigned int f_flags; if (flags != 0) { @@ -1504,11 +1526,6 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto free_iov; } - if (!process_madvise_behavior_valid(behavior)) { - ret = -EINVAL; - goto release_task; - } - /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); if (IS_ERR(mm)) { @@ -1516,26 +1533,26 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, goto release_task; } + /* + * We need only perform this check if we are attempting to manipulate a + * remote process's address space. + */ + if (mm != current->mm && !process_madvise_remote_valid(behavior)) { + ret = -EINVAL; + goto release_mm; + } + /* * Require CAP_SYS_NICE for influencing process performance. Note that - * only non-destructive hints are currently supported. + * only non-destructive hints are currently supported for remote + * processes. */ if (mm != current->mm && !capable(CAP_SYS_NICE)) { ret = -EPERM; goto release_mm; } - total_len = iov_iter_count(&iter); - - while (iov_iter_count(&iter)) { - ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter), - iter_iov_len(&iter), behavior); - if (ret < 0) - break; - iov_iter_advance(&iter, iter_iov_len(&iter)); - } - - ret = (total_len - iov_iter_count(&iter)) ? 
: ret; + ret = vector_madvise(mm, &iter, behavior); release_mm: mmput(mm); From f2f484085ef1a2bb5aea861a06bc6b4dc50d2ab8 Mon Sep 17 00:00:00 2001 From: Nanyong Sun Date: Thu, 26 Sep 2024 15:49:22 +0800 Subject: [PATCH 023/215] mm: move mm flags to mm_types.h The types of mm flags are now far beyond the core dump related features. This patch moves mm flags from linux/sched/coredump.h to linux/mm_types.h. The linux/sched/coredump.h has include the mm_types.h, so the C files related to coredump does not need to change head file inclusion. In addition, the inclusion of sched/coredump.h now can be deleted from the C files that irrelevant to core dump. Link: https://lkml.kernel.org/r/20240926074922.2721274-1-sunnanyong@huawei.com Signed-off-by: Nanyong Sun Cc: Kefeng Wang Cc: Masami Hiramatsu Cc: Matthew Wilcox Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 1 - include/linux/khugepaged.h | 2 - include/linux/ksm.h | 1 - include/linux/mm_types.h | 84 ++++++++++++++++++++++++++++++++++ include/linux/oom.h | 1 - include/linux/sched/coredump.h | 82 --------------------------------- kernel/events/uprobes.c | 1 - kernel/fork.c | 1 - mm/huge_memory.c | 1 - mm/khugepaged.c | 1 - mm/ksm.c | 1 - mm/memory.c | 1 - mm/oom_kill.c | 1 - 13 files changed, 84 insertions(+), 94 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index ef5b80e48599c..8afe09a2cf03b 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -2,7 +2,6 @@ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H -#include #include #include /* only for vma_is_dax() */ diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 30baae91b2255..1f46046080f51 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -2,8 +2,6 @@ #ifndef _LINUX_KHUGEPAGED_H #define _LINUX_KHUGEPAGED_H -#include /* MMF_VM_HUGEPAGE */ - extern unsigned int khugepaged_max_ptes_none __read_mostly; #ifdef CONFIG_TRANSPARENT_HUGEPAGE extern struct attribute_group khugepaged_attr_group; diff --git a/include/linux/ksm.h b/include/linux/ksm.h index ec9c05044d4fe..29022e71a074a 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -13,7 +13,6 @@ #include #include #include -#include #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6e3bdf8e38bca..ff8627acbaa70 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -1499,4 +1499,88 @@ enum { /* See also internal only FOLL flags in mm/internal.h */ }; +/* mm flags */ + +/* + * The first two bits represent core dump modes for set-user-ID, + * the modes are SUID_DUMP_* defined in linux/sched/coredump.h + */ +#define MMF_DUMPABLE_BITS 2 +#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) +/* coredump filter bits */ +#define MMF_DUMP_ANON_PRIVATE 2 +#define MMF_DUMP_ANON_SHARED 3 +#define MMF_DUMP_MAPPED_PRIVATE 4 +#define MMF_DUMP_MAPPED_SHARED 5 +#define MMF_DUMP_ELF_HEADERS 6 +#define MMF_DUMP_HUGETLB_PRIVATE 7 +#define MMF_DUMP_HUGETLB_SHARED 8 +#define MMF_DUMP_DAX_PRIVATE 9 +#define MMF_DUMP_DAX_SHARED 10 + +#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS +#define MMF_DUMP_FILTER_BITS 9 +#define MMF_DUMP_FILTER_MASK \ + (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) +#define MMF_DUMP_FILTER_DEFAULT \ + ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ + (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) + +#ifdef 
CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS +# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) +#else +# define MMF_DUMP_MASK_DEFAULT_ELF 0 +#endif + /* leave room for more dump flags */ +#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when mm is available for khugepaged */ + +/* + * This one-shot flag is dropped due to necessity of changing exe once again + * on NFS restore + */ +//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ + +#define MMF_HAS_UPROBES 19 /* has uprobes */ +#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ +#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ +#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ +#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ +#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ +#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) +#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ +/* + * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either + * replaced in the future by mm.pinned_vm when it becomes stable, or grow into + * a counter on its own. We're aggresive on this bit for now: even if the + * pinned pages were unpinned later on, we'll still keep this bit set for the + * lifecycle of this mm, just for simplicity. + */ +#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ + +#define MMF_HAS_MDWE 28 +#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) + + +#define MMF_HAS_MDWE_NO_INHERIT 29 + +#define MMF_VM_MERGE_ANY 30 +#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) + +#define MMF_TOPDOWN 31 /* mm searches top down by default */ +#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) + +#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ + MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ + MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) + +static inline unsigned long mmf_init_flags(unsigned long flags) +{ + if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) + flags &= ~((1UL << MMF_HAS_MDWE) | + (1UL << MMF_HAS_MDWE_NO_INHERIT)); + return flags & MMF_INIT_MASK; +} + #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/linux/oom.h b/include/linux/oom.h index 7d0c9c48a0c54..1e0fc6931ce96 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -7,7 +7,6 @@ #include #include #include -#include /* MMF_* */ #include /* VM_FAULT* */ struct zonelist; diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index e62ff805cfc95..6eb65ceed213e 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -8,12 +8,6 @@ #define SUID_DUMP_USER 1 /* Dump as user of process */ #define SUID_DUMP_ROOT 2 /* Dump as root */ -/* mm flags */ - -/* for SUID_DUMP_* above */ -#define MMF_DUMPABLE_BITS 2 -#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1) - extern void set_dumpable(struct mm_struct *mm, int value); /* * This returns the actual value of the suid_dumpable flag. 
For things @@ -31,80 +25,4 @@ static inline int get_dumpable(struct mm_struct *mm) return __get_dumpable(mm->flags); } -/* coredump filter bits */ -#define MMF_DUMP_ANON_PRIVATE 2 -#define MMF_DUMP_ANON_SHARED 3 -#define MMF_DUMP_MAPPED_PRIVATE 4 -#define MMF_DUMP_MAPPED_SHARED 5 -#define MMF_DUMP_ELF_HEADERS 6 -#define MMF_DUMP_HUGETLB_PRIVATE 7 -#define MMF_DUMP_HUGETLB_SHARED 8 -#define MMF_DUMP_DAX_PRIVATE 9 -#define MMF_DUMP_DAX_SHARED 10 - -#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS -#define MMF_DUMP_FILTER_BITS 9 -#define MMF_DUMP_FILTER_MASK \ - (((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT) -#define MMF_DUMP_FILTER_DEFAULT \ - ((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\ - (1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF) - -#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS -# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS) -#else -# define MMF_DUMP_MASK_DEFAULT_ELF 0 -#endif - /* leave room for more dump flags */ -#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ -#define MMF_VM_HUGEPAGE 17 /* set when mm is available for - khugepaged */ -/* - * This one-shot flag is dropped due to necessity of changing exe once again - * on NFS restore - */ -//#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ - -#define MMF_HAS_UPROBES 19 /* has uprobes */ -#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */ -#define MMF_OOM_SKIP 21 /* mm is of no interest for the OOM killer */ -#define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ -#define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ -#define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) -#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ -/* - * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either - * replaced in the future by mm.pinned_vm when it becomes stable, or grow into - * a counter on its own. We're aggresive on this bit for now: even if the - * pinned pages were unpinned later on, we'll still keep this bit set for the - * lifecycle of this mm, just for simplicity. 
- */ -#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ - -#define MMF_HAS_MDWE 28 -#define MMF_HAS_MDWE_MASK (1 << MMF_HAS_MDWE) - - -#define MMF_HAS_MDWE_NO_INHERIT 29 - -#define MMF_VM_MERGE_ANY 30 -#define MMF_VM_MERGE_ANY_MASK (1 << MMF_VM_MERGE_ANY) - -#define MMF_TOPDOWN 31 /* mm searches top down by default */ -#define MMF_TOPDOWN_MASK (1 << MMF_TOPDOWN) - -#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ - MMF_DISABLE_THP_MASK | MMF_HAS_MDWE_MASK |\ - MMF_VM_MERGE_ANY_MASK | MMF_TOPDOWN_MASK) - -static inline unsigned long mmf_init_flags(unsigned long flags) -{ - if (flags & (1UL << MMF_HAS_MDWE_NO_INHERIT)) - flags &= ~((1UL << MMF_HAS_MDWE) | - (1UL << MMF_HAS_MDWE_NO_INHERIT)); - return flags & MMF_INIT_MASK; -} - #endif /* _LINUX_SCHED_COREDUMP_H */ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4b52cb2ae6d62..75ac18a3ac0ff 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include /* anon_vma_prepare */ #include diff --git a/kernel/fork.c b/kernel/fork.c index b2ab422f62309..61a4abd628f35 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03fd4bc39ea15..e71b58d84cba2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -8,7 +8,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 0bd80e134010f..ed1a225dd198e 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -4,7 +4,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/ksm.c b/mm/ksm.c index a2e2a521df0ae..dec536d6d91ad 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/memory.c b/mm/memory.c index bdf77a3ec47bc..c8d5d040d6ab1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4d7a0004df2ca..1c485beb0b934 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -24,7 +24,6 @@ #include #include #include -#include #include #include #include From 66efef9b1a7d6cc725efa9395fb390483ad5b555 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:14 +0800 Subject: [PATCH 024/215] mm: pgtable: introduce pte_offset_map_{ro|rw}_nolock() Patch series "introduce pte_offset_map_{ro|rw}_nolock()", v5. As proposed by David Hildenbrand [1], this series introduces the following two new helper functions to replace pte_offset_map_nolock(). 1. pte_offset_map_ro_nolock() 2. pte_offset_map_rw_nolock() As the name suggests, pte_offset_map_ro_nolock() is used for read-only case. In this case, only read-only operations will be performed on PTE page after the PTL is held. The RCU lock in pte_offset_map_nolock() will ensure that the PTE page will not be freed, and there is no need to worry about whether the pmd entry is modified. Therefore pte_offset_map_ro_nolock() is just a renamed version of pte_offset_map_nolock(). pte_offset_map_rw_nolock() is used for may-write case. In this case, the pte or pmd entry may be modified after the PTL is held, so we need to ensure that the pmd entry has not been modified concurrently. So in addition to the name change, it also outputs the pmdval when successful. 
The users should make sure the page table is stable like checking pte_same() or checking pmd_same() by using the output pmdval before performing the write operations. This series will convert all pte_offset_map_nolock() into the above two helper functions one by one, and finally completely delete it. This also a preparation for reclaiming the empty user PTE page table pages. This patch (of 13): Currently, the usage of pte_offset_map_nolock() can be divided into the following two cases: 1) After acquiring PTL, only read-only operations are performed on the PTE page. In this case, the RCU lock in pte_offset_map_nolock() will ensure that the PTE page will not be freed, and there is no need to worry about whether the pmd entry is modified. 2) After acquiring PTL, the pte or pmd entries may be modified. At this time, we need to ensure that the pmd entry has not been modified concurrently. To more clearing distinguish between these two cases, this commit introduces two new helper functions to replace pte_offset_map_nolock(). For 1), just rename it to pte_offset_map_ro_nolock(). For 2), in addition to changing the name to pte_offset_map_rw_nolock(), it also outputs the pmdval when successful. It is applicable for may-write cases where any modification operations to the page table may happen after the corresponding spinlock is held afterwards. But the users should make sure the page table is stable like checking pte_same() or checking pmd_same() by using the output pmdval before performing the write operations. Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will be read-only/read-write protected. Subsequent commits will convert pte_offset_map_nolock() into the above two functions one by one, and finally completely delete it. Link: https://lkml.kernel.org/r/cover.1727332572.git.zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/5aeecfa131600a454b1f3a038a1a54282ca3b856.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/split_page_table_lock.rst | 7 ++++ include/linux/mm.h | 5 +++ mm/pgtable-generic.c | 48 ++++++++++++++++++++++ 3 files changed, 60 insertions(+) diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst index e4f6972eb6c04..08d0e706a32db 100644 --- a/Documentation/mm/split_page_table_lock.rst +++ b/Documentation/mm/split_page_table_lock.rst @@ -19,6 +19,13 @@ There are helpers to lock/unlock a table and other accessor functions: - pte_offset_map_nolock() maps PTE, returns pointer to PTE with pointer to its PTE table lock (not taken), or returns NULL if no PTE table; + - pte_offset_map_ro_nolock() + maps PTE, returns pointer to PTE with pointer to its PTE table + lock (not taken), or returns NULL if no PTE table; + - pte_offset_map_rw_nolock() + maps PTE, returns pointer to PTE with pointer to its PTE table + lock (not taken) and the value of its pmd entry, or returns NULL + if no PTE table; - pte_offset_map() maps PTE, returns pointer to PTE, or returns NULL if no PTE table; - pte_unmap() diff --git a/include/linux/mm.h b/include/linux/mm.h index 61fff5d34ed53..0cf45d4b72866 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3017,6 +3017,11 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, pte_t *pte_offset_map_nolock(struct mm_struct 
*mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); +pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp); +pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, pmd_t *pmdvalp, + spinlock_t **ptlp); #define pte_unmap_unlock(pte, ptl) do { \ spin_unlock(ptl); \ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index a78a4adf711ac..daa08b91ab6b2 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -317,6 +317,31 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, return pte; } +pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, spinlock_t **ptlp) +{ + pmd_t pmdval; + pte_t *pte; + + pte = __pte_offset_map(pmd, addr, &pmdval); + if (likely(pte)) + *ptlp = pte_lockptr(mm, &pmdval); + return pte; +} + +pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, + unsigned long addr, pmd_t *pmdvalp, + spinlock_t **ptlp) +{ + pte_t *pte; + + VM_WARN_ON_ONCE(!pmdvalp); + pte = __pte_offset_map(pmd, addr, pmdvalp); + if (likely(pte)) + *ptlp = pte_lockptr(mm, pmdvalp); + return pte; +} + /* * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation * __pte_offset_map_lock() below, is usually called with the pmd pointer for @@ -356,6 +381,29 @@ pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, * recheck *pmd once the lock is taken; in practice, no callsite needs that - * either the mmap_lock for write, or pte_same() check on contents, is enough. * + * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); + * but when successful, it also outputs a pointer to the spinlock in ptlp - as + * pte_offset_map_lock() does, but in this case without locking it. This helps + * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time + * act on a changed *pmd: pte_offset_map_ro_nolock() provides the correct spinlock + * pointer for the page table that it returns. Even after grabbing the spinlock, + * we might be looking either at a page table that is still mapped or one that + * was unmapped and is about to get freed. But for R/O access this is sufficient. + * So it is only applicable for read-only cases where any modification operations + * to the page table are not allowed even if the corresponding spinlock is held + * afterwards. + * + * pte_offset_map_rw_nolock(mm, pmd, addr, pmdvalp, ptlp), above, is like + * pte_offset_map_ro_nolock(); but when successful, it also outputs the pdmval. + * It is applicable for may-write cases where any modification operations to the + * page table may happen after the corresponding spinlock is held afterwards. + * But the users should make sure the page table is stable like checking pte_same() + * or checking pmd_same() by using the output pmdval before performing the write + * operations. + * + * Note: "RO" / "RW" expresses the intended semantics, not that the *kmap* will + * be read-only/read-write protected. 
+ * * Note that free_pgtables(), used after unmapping detached vmas, or when * exiting the whole mm, does not take page table lock before freeing a page * table, and may not use RCU at all: "outsiders" like khugepaged should avoid From 7aefa59899e576db093ff077fd1ebd0d1b748f33 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:15 +0800 Subject: [PATCH 025/215] powerpc: assert_pte_locked() use pte_offset_map_ro_nolock() In assert_pte_locked(), we just get the ptl and assert if it was already held, so convert it to using pte_offset_map_ro_nolock(). Link: https://lkml.kernel.org/r/42559e042eb6fc3129a40f710d671712030646b4.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- arch/powerpc/mm/pgtable.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 7316396e452d8..61df5aed79894 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -398,7 +398,7 @@ void assert_pte_locked(struct mm_struct *mm, unsigned long addr) */ if (pmd_none(*pmd)) return; - pte = pte_offset_map_nolock(mm, pmd, addr, &ptl); + pte = pte_offset_map_ro_nolock(mm, pmd, addr, &ptl); BUG_ON(!pte); assert_spin_locked(ptl); pte_unmap(pte); From bd6ad65ddcbb2d0aceb843d31d4f1bd8d628200a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:16 +0800 Subject: [PATCH 026/215] mm: filemap: filemap_fault_recheck_pte_none() use pte_offset_map_ro_nolock() In filemap_fault_recheck_pte_none(), we just do pte_none() check, so convert it to using pte_offset_map_ro_nolock(). Link: https://lkml.kernel.org/r/9f7cbbaa772385ced1b8931b67a8b9d246c9b82d.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/filemap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 36d22968be9a1..630a1c431ea15 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3258,8 +3258,8 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf) if (!(vmf->flags & FAULT_FLAG_ORIG_PTE_VALID)) return 0; - ptep = pte_offset_map_nolock(vma->vm_mm, vmf->pmd, vmf->address, - &vmf->ptl); + ptep = pte_offset_map_ro_nolock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); if (unlikely(!ptep)) return VM_FAULT_NOPAGE; From c85507857bb8904f8631b3a89b19aa73b1f77e48 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:17 +0800 Subject: [PATCH 027/215] mm: khugepaged: __collapse_huge_page_swapin() use pte_offset_map_ro_nolock() In __collapse_huge_page_swapin(), we just use the ptl for pte_same() check in do_swap_page(). In other places, we directly use pte_offset_map_lock(), so convert it to using pte_offset_map_ro_nolock(). 
Link: https://lkml.kernel.org/r/dc97a6c3cb9ea80cab30c5626eeea79959d93258.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ed1a225dd198e..8e0d05bd3d560 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1014,7 +1014,11 @@ static int __collapse_huge_page_swapin(struct mm_struct *mm, }; if (!pte++) { - pte = pte_offset_map_nolock(mm, pmd, address, &ptl); + /* + * Here the ptl is only used to check pte_same() in + * do_swap_page(), so readonly version is enough. + */ + pte = pte_offset_map_ro_nolock(mm, pmd, address, &ptl); if (!pte) { mmap_read_unlock(mm); result = SCAN_PMD_NULL; From fc9c45b71f43cafcc0435dd4c7a2d3b99955a0fa Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:18 +0800 Subject: [PATCH 028/215] arm: adjust_pte() use pte_offset_map_rw_nolock() In do_adjust_pte(), we may modify the pte entry. The corresponding pmd entry may have been modified concurrently. Therefore, in order to ensure the stability if pmd entry, use pte_offset_map_rw_nolock() to replace pte_offset_map_nolock(), and do pmd_same() check after holding the PTL. All callers of update_mmu_cache_range() hold the vmf->ptl, so we can determined whether split PTE locks is being used by doing the following, just as we do elsewhere in the kernel. ptl != vmf->ptl And then we can delete the do_pte_lock() and do_pte_unlock(). Link: https://lkml.kernel.org/r/0eaf6b69aeb2fe35092a633fed12537efe645303.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- arch/arm/mm/fault-armv.c | 53 +++++++++++++++++----------------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/arch/arm/mm/fault-armv.c b/arch/arm/mm/fault-armv.c index 831793cd6ff94..2bec87c3327d2 100644 --- a/arch/arm/mm/fault-armv.c +++ b/arch/arm/mm/fault-armv.c @@ -61,32 +61,8 @@ static int do_adjust_pte(struct vm_area_struct *vma, unsigned long address, return ret; } -#if defined(CONFIG_SPLIT_PTE_PTLOCKS) -/* - * If we are using split PTE locks, then we need to take the page - * lock here. Otherwise we are using shared mm->page_table_lock - * which is already locked, thus cannot take it. - */ -static inline void do_pte_lock(spinlock_t *ptl) -{ - /* - * Use nested version here to indicate that we are already - * holding one similar spinlock. 
- */ - spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); -} - -static inline void do_pte_unlock(spinlock_t *ptl) -{ - spin_unlock(ptl); -} -#else /* !defined(CONFIG_SPLIT_PTE_PTLOCKS) */ -static inline void do_pte_lock(spinlock_t *ptl) {} -static inline void do_pte_unlock(spinlock_t *ptl) {} -#endif /* defined(CONFIG_SPLIT_PTE_PTLOCKS) */ - static int adjust_pte(struct vm_area_struct *vma, unsigned long address, - unsigned long pfn) + unsigned long pfn, struct vm_fault *vmf) { spinlock_t *ptl; pgd_t *pgd; @@ -94,6 +70,7 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, pud_t *pud; pmd_t *pmd; pte_t *pte; + pmd_t pmdval; int ret; pgd = pgd_offset(vma->vm_mm, address); @@ -112,20 +89,33 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, if (pmd_none_or_clear_bad(pmd)) return 0; +again: /* * This is called while another page table is mapped, so we * must use the nested version. This also means we need to * open-code the spin-locking. */ - pte = pte_offset_map_nolock(vma->vm_mm, pmd, address, &ptl); + pte = pte_offset_map_rw_nolock(vma->vm_mm, pmd, address, &pmdval, &ptl); if (!pte) return 0; - do_pte_lock(ptl); + /* + * If we are using split PTE locks, then we need to take the page + * lock here. Otherwise we are using shared mm->page_table_lock + * which is already locked, thus cannot take it. + */ + if (ptl != vmf->ptl) { + spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { + pte_unmap_unlock(pte, ptl); + goto again; + } + } ret = do_adjust_pte(vma, address, pfn, pte); - do_pte_unlock(ptl); + if (ptl != vmf->ptl) + spin_unlock(ptl); pte_unmap(pte); return ret; @@ -133,7 +123,8 @@ static int adjust_pte(struct vm_area_struct *vma, unsigned long address, static void make_coherent(struct address_space *mapping, struct vm_area_struct *vma, - unsigned long addr, pte_t *ptep, unsigned long pfn) + unsigned long addr, pte_t *ptep, unsigned long pfn, + struct vm_fault *vmf) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *mpnt; @@ -160,7 +151,7 @@ make_coherent(struct address_space *mapping, struct vm_area_struct *vma, if (!(mpnt->vm_flags & VM_MAYSHARE)) continue; offset = (pgoff - mpnt->vm_pgoff) << PAGE_SHIFT; - aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn); + aliases += adjust_pte(mpnt, mpnt->vm_start + offset, pfn, vmf); } flush_dcache_mmap_unlock(mapping); if (aliases) @@ -203,7 +194,7 @@ void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, __flush_dcache_folio(mapping, folio); if (mapping) { if (cache_is_vivt()) - make_coherent(mapping, vma, addr, ptep, pfn); + make_coherent(mapping, vma, addr, ptep, pfn, vmf); else if (vma->vm_flags & VM_EXEC) __flush_icache_all(); } From d9c1ddf37b4c287597a4578e70d19ed68d536be8 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:19 +0800 Subject: [PATCH 029/215] mm: handle_pte_fault() use pte_offset_map_rw_nolock() In handle_pte_fault(), we may modify the vmf->pte after acquiring the vmf->ptl, so convert it to using pte_offset_map_rw_nolock(). But since we will do the pte_same() check, so there is no need to get pmdval to do pmd_same() check, just pass a dummy variable to it. 
Link: https://lkml.kernel.org/r/af8d694853b44c5a6018403ae435440e275854c7.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Acked-by: David Hildenbrand Reviewed-by: Muchun Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index c8d5d040d6ab1..ce5cd8d4c4011 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5742,14 +5742,24 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf) vmf->pte = NULL; vmf->flags &= ~FAULT_FLAG_ORIG_PTE_VALID; } else { + pmd_t dummy_pmdval; + /* * A regular pmd is established and it can't morph into a huge * pmd by anon khugepaged, since that takes mmap_lock in write * mode; but shmem or file collapse to THP could still morph * it into a huge pmd: just retry later if so. + * + * Use the maywrite version to indicate that vmf->pte may be + * modified, but since we will use pte_same() to detect the + * change of the !pte_none() entry, there is no need to recheck + * the pmdval. Here we chooes to pass a dummy variable instead + * of NULL, which helps new user think about why this place is + * special. */ - vmf->pte = pte_offset_map_nolock(vmf->vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); + vmf->pte = pte_offset_map_rw_nolock(vmf->vma->vm_mm, vmf->pmd, + vmf->address, &dummy_pmdval, + &vmf->ptl); if (unlikely(!vmf->pte)) return 0; vmf->orig_pte = ptep_get_lockless(vmf->pte); From 6dfd0d2cb3691040979ddbd6c758956694a3185d Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:20 +0800 Subject: [PATCH 030/215] mm: khugepaged: collapse_pte_mapped_thp() use pte_offset_map_rw_nolock() In collapse_pte_mapped_thp(), we may modify the pte and pmd entry after acquiring the ptl, so convert it to using pte_offset_map_rw_nolock(). At this time, the pte_same() check is not performed after the PTL held. So we should get pgt_pmd and do pmd_same() check after the ptl held. 
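The complementary pattern, used when no pte_same() recheck is available, looks roughly like this (again a schematic kernel-context sketch rather than code from the patch; the function name is illustrative):

/*
 * Sketch: without a pte_same() check, the pmd value returned by
 * pte_offset_map_rw_nolock() is compared against the current pmd once the
 * PTL is held, so a concurrently freed or replaced PTE page is detected
 * before any modification is made.
 */
static int sketch_walk_ptes(struct mm_struct *mm, pmd_t *pmd,
			    unsigned long addr)
{
	spinlock_t *ptl;
	pmd_t pmdval;
	pte_t *pte;

	pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
	if (!pte)
		return 0;

	spin_lock(ptl);
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		/* The PTE page changed under us: bail out (or retry). */
		pte_unmap_unlock(pte, ptl);
		return 0;
	}
	/* ... safe to modify PTEs under the PTL ... */
	pte_unmap_unlock(pte, ptl);
	return 1;
}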
Link: https://lkml.kernel.org/r/055e42db68da00ac8ecab94bd2633c7cd965eb1c.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 8e0d05bd3d560..6f8d46d107b4b 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1608,7 +1608,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, if (userfaultfd_armed(vma) && !(vma->vm_flags & VM_SHARED)) pml = pmd_lock(mm, pmd); - start_pte = pte_offset_map_nolock(mm, pmd, haddr, &ptl); + start_pte = pte_offset_map_rw_nolock(mm, pmd, haddr, &pgt_pmd, &ptl); if (!start_pte) /* mmap_lock + page lock should prevent this */ goto abort; if (!pml) @@ -1616,6 +1616,9 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, else if (ptl != pml) spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) + goto abort; + /* step 2: clear page table and adjust rmap */ for (i = 0, addr = haddr, pte = start_pte; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE, pte++) { @@ -1648,7 +1651,6 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, nr_ptes++; } - pte_unmap(start_pte); if (!pml) spin_unlock(ptl); @@ -1661,14 +1663,19 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, /* step 4: remove empty page table */ if (!pml) { pml = pmd_lock(mm, pmd); - if (ptl != pml) + if (ptl != pml) { spin_lock_nested(ptl, SINGLE_DEPTH_NESTING); + if (unlikely(!pmd_same(pgt_pmd, pmdp_get_lockless(pmd)))) { + flush_tlb_mm(mm); + goto unlock; + } + } } pgt_pmd = pmdp_collapse_flush(vma, haddr, pmd); pmdp_get_lockless_sync(); + pte_unmap_unlock(start_pte, ptl); if (ptl != pml) - spin_unlock(ptl); - spin_unlock(pml); + spin_unlock(pml); mmu_notifier_invalidate_range_end(&range); @@ -1688,6 +1695,7 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, folio_ref_sub(folio, nr_ptes); add_mm_counter(mm, mm_counter_file(folio), -nr_ptes); } +unlock: if (start_pte) pte_unmap_unlock(start_pte, ptl); if (pml && pml != ptl) From 24553a978b6fbd96fcb83c897c23569351ddebe2 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:21 +0800 Subject: [PATCH 031/215] mm: copy_pte_range() use pte_offset_map_rw_nolock() In copy_pte_range(), we may modify the src_pte entry after holding the src_ptl, so convert it to using pte_offset_map_rw_nolock(). Since we already hold the exclusive mmap_lock, and the copy_pte_range() and retract_page_tables() are using vma->anon_vma to be exclusive, so the PTE page is stable, there is no need to get pmdval and do pmd_same() check. 
Link: https://lkml.kernel.org/r/9166f6fad806efbca72e318ab6f0f8af458056a9.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index ce5cd8d4c4011..6bda739a60e8b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1084,6 +1084,7 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, struct mm_struct *src_mm = src_vma->vm_mm; pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; + pmd_t dummy_pmdval; pte_t ptent; spinlock_t *src_ptl, *dst_ptl; int progress, max_nr, ret = 0; @@ -1109,7 +1110,15 @@ copy_pte_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma, ret = -ENOMEM; goto out; } - src_pte = pte_offset_map_nolock(src_mm, src_pmd, addr, &src_ptl); + + /* + * We already hold the exclusive mmap_lock, the copy_pte_range() and + * retract_page_tables() are using vma->anon_vma to be exclusive, so + * the PTE page is stable, and there is no need to get pmdval and do + * pmd_same() check. + */ + src_pte = pte_offset_map_rw_nolock(src_mm, src_pmd, addr, &dummy_pmdval, + &src_ptl); if (!src_pte) { pte_unmap_unlock(dst_pte, dst_ptl); /* ret == 0 */ From 838d02354464c301fcddf4f524365846608ac296 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:22 +0800 Subject: [PATCH 032/215] mm: mremap: move_ptes() use pte_offset_map_rw_nolock() In move_ptes(), we may modify the new_pte after acquiring the new_ptl, so convert it to using pte_offset_map_rw_nolock(). Now new_pte is none, so hpage_collapse_scan_file() path can not find this by traversing file->f_mapping, so there is no concurrency with retract_page_tables(). In addition, we already hold the exclusive mmap_lock, so this new_pte page is stable, so there is no need to get pmdval and do pmd_same() check. Link: https://lkml.kernel.org/r/9d582a09dbcf12e562ac5fe0ba05e9248a58f5e0.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/mremap.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/mremap.c b/mm/mremap.c index dda09e957a5d4..5917feafe8cc5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -140,6 +140,7 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, { struct mm_struct *mm = vma->vm_mm; pte_t *old_pte, *new_pte, pte; + pmd_t dummy_pmdval; spinlock_t *old_ptl, *new_ptl; bool force_flush = false; unsigned long len = old_end - old_addr; @@ -175,7 +176,15 @@ static int move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, err = -EAGAIN; goto out; } - new_pte = pte_offset_map_nolock(mm, new_pmd, new_addr, &new_ptl); + /* + * Now new_pte is none, so hpage_collapse_scan_file() path can not find + * this by traversing file->f_mapping, so there is no concurrency with + * retract_page_tables(). In addition, we already hold the exclusive + * mmap_lock, so this new_pte page is stable, so there is no need to get + * pmdval and do pmd_same() check. 
+ */ + new_pte = pte_offset_map_rw_nolock(mm, new_pmd, new_addr, &dummy_pmdval, + &new_ptl); if (!new_pte) { pte_unmap_unlock(old_pte, old_ptl); err = -EAGAIN; From 04965da7a4af790d99c360e79b00bd1f93f80eb1 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:23 +0800 Subject: [PATCH 033/215] mm: page_vma_mapped_walk: map_pte() use pte_offset_map_rw_nolock() In the caller of map_pte(), we may modify the pvmw->pte after acquiring the pvmw->ptl, so convert it to using pte_offset_map_rw_nolock(). At this time, the pte_same() check is not performed after the pvmw->ptl held, so we should get pmdval and do pmd_same() check to ensure the stability of pvmw->pmd. Link: https://lkml.kernel.org/r/2620a48f34c9f19864ab0169cdbf253d31a8fcaa.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/page_vma_mapped.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index ae5cc42aa2087..ab1671e71cb2d 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -13,7 +13,8 @@ static inline bool not_found(struct page_vma_mapped_walk *pvmw) return false; } -static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) +static bool map_pte(struct page_vma_mapped_walk *pvmw, pmd_t *pmdvalp, + spinlock_t **ptlp) { pte_t ptent; @@ -25,6 +26,7 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) return !!pvmw->pte; } +again: /* * It is important to return the ptl corresponding to pte, * in case *pvmw->pmd changes underneath us; so we need to @@ -32,8 +34,8 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) * proceeds to loop over next ptes, and finds a match later. * Though, in most cases, page lock already protects this. */ - pvmw->pte = pte_offset_map_nolock(pvmw->vma->vm_mm, pvmw->pmd, - pvmw->address, ptlp); + pvmw->pte = pte_offset_map_rw_nolock(pvmw->vma->vm_mm, pvmw->pmd, + pvmw->address, pmdvalp, ptlp); if (!pvmw->pte) return false; @@ -67,8 +69,13 @@ static bool map_pte(struct page_vma_mapped_walk *pvmw, spinlock_t **ptlp) } else if (!pte_present(ptent)) { return false; } + spin_lock(*ptlp); + if (unlikely(!pmd_same(*pmdvalp, pmdp_get_lockless(pvmw->pmd)))) { + pte_unmap_unlock(pvmw->pte, *ptlp); + goto again; + } pvmw->ptl = *ptlp; - spin_lock(pvmw->ptl); + return true; } @@ -278,7 +285,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) step_forward(pvmw, PMD_SIZE); continue; } - if (!map_pte(pvmw, &ptl)) { + if (!map_pte(pvmw, &pmde, &ptl)) { if (!pvmw->pte) goto restart; goto next_pte; @@ -305,8 +312,13 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) } while (pte_none(ptep_get(pvmw->pte))); if (!pvmw->ptl) { + spin_lock(ptl); + if (unlikely(!pmd_same(pmde, pmdp_get_lockless(pvmw->pmd)))) { + pte_unmap_unlock(pvmw->pte, ptl); + pvmw->pte = NULL; + goto restart; + } pvmw->ptl = ptl; - spin_lock(pvmw->ptl); } goto this_pte; } while (pvmw->address < end); From e9c74b5431632d2ca60725ffff6fc1fe2b80f246 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:24 +0800 Subject: [PATCH 034/215] mm: userfaultfd: move_pages_pte() use pte_offset_map_rw_nolock() In move_pages_pte(), we may modify the dst_pte and src_pte after acquiring the ptl, so convert it to using pte_offset_map_rw_nolock(). 
But since we will use pte_same() to detect the change of the pte entry, there is no need to get pmdval, so just pass a dummy variable to it. Link: https://lkml.kernel.org/r/1530e8fdbfc72eacf3b095babe139ce3d715600a.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ce13c40626472..48b87c62fc3dd 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -1135,7 +1135,7 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, spinlock_t *src_ptl, *dst_ptl; pte_t *src_pte = NULL; pte_t *dst_pte = NULL; - + pmd_t dummy_pmdval; struct folio *src_folio = NULL; struct anon_vma *src_anon_vma = NULL; struct mmu_notifier_range range; @@ -1146,7 +1146,14 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, src_addr, src_addr + PAGE_SIZE); mmu_notifier_invalidate_range_start(&range); retry: - dst_pte = pte_offset_map_nolock(mm, dst_pmd, dst_addr, &dst_ptl); + /* + * Use the maywrite version to indicate that dst_pte will be modified, + * but since we will use pte_same() to detect the change of the pte + * entry, there is no need to get pmdval, so just pass a dummy variable + * to it. + */ + dst_pte = pte_offset_map_rw_nolock(mm, dst_pmd, dst_addr, &dummy_pmdval, + &dst_ptl); /* Retry if a huge pmd materialized from under us */ if (unlikely(!dst_pte)) { @@ -1154,7 +1161,9 @@ static int move_pages_pte(struct mm_struct *mm, pmd_t *dst_pmd, pmd_t *src_pmd, goto out; } - src_pte = pte_offset_map_nolock(mm, src_pmd, src_addr, &src_ptl); + /* same as dst_pte */ + src_pte = pte_offset_map_rw_nolock(mm, src_pmd, src_addr, &dummy_pmdval, + &src_ptl); /* * We held the mmap_lock for reading so MADV_DONTNEED From 2441774f2d2890940f2db21bbc264c7e2f56d1ae Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:25 +0800 Subject: [PATCH 035/215] mm: multi-gen LRU: walk_pte_range() use pte_offset_map_rw_nolock() In walk_pte_range(), we may modify the pte entry after holding the ptl, so convert it to using pte_offset_map_rw_nolock(). At this time, the pte_same() check is not performed after the ptl held, so we should get pmdval and do pmd_same() check to ensure the stability of pmd entry. 
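The opposite case, used here as well as in the collapse_pte_mapped_thp() and map_pte() conversions above, is when no later pte_same() check protects the caller; the shape of that pattern is roughly the following (an illustrative sketch, not the exact walk_pte_range() code):

static bool modify_ptes_checked(struct mm_struct *mm, pmd_t *pmd,
				unsigned long addr)
{
	spinlock_t *ptl;
	pmd_t pmdval;
	pte_t *pte;

	pte = pte_offset_map_rw_nolock(mm, pmd, addr, &pmdval, &ptl);
	if (!pte)
		return false;

	spin_lock(ptl);
	/*
	 * The PTE page may have been freed and the pmd repopulated between
	 * the lockless map above and taking ptl, so validate the pmd before
	 * writing through the mapped PTEs.
	 */
	if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		pte_unmap_unlock(pte, ptl);
		return false;
	}

	/* ... safe to modify PTEs here ... */

	pte_unmap_unlock(pte, ptl);
	return true;
}
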
Link: https://lkml.kernel.org/r/7e9c194a5efacc9609cfd31abb9c7df88b53b530.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- mm/vmscan.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 20dd72c98813d..8f25dd6cec54b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3386,8 +3386,10 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, struct pglist_data *pgdat = lruvec_pgdat(walk->lruvec); DEFINE_MAX_SEQ(walk->lruvec); int old_gen, new_gen = lru_gen_from_seq(max_seq); + pmd_t pmdval; - pte = pte_offset_map_nolock(args->mm, pmd, start & PMD_MASK, &ptl); + pte = pte_offset_map_rw_nolock(args->mm, pmd, start & PMD_MASK, &pmdval, + &ptl); if (!pte) return false; if (!spin_trylock(ptl)) { @@ -3395,6 +3397,11 @@ static bool walk_pte_range(pmd_t *pmd, unsigned long start, unsigned long end, return false; } + if (unlikely(!pmd_same(pmdval, pmdp_get_lockless(pmd)))) { + pte_unmap_unlock(pte, ptl); + return false; + } + arch_enter_lazy_mmu_mode(); restart: for (i = pte_index(start), addr = start; addr != end; i++, addr += PAGE_SIZE) { From 583e66debd1d5aa8c401aebe924c7406e15579a7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 26 Sep 2024 14:46:26 +0800 Subject: [PATCH 036/215] mm: pgtable: remove pte_offset_map_nolock() Now no users are using the pte_offset_map_nolock(), remove it. Link: https://lkml.kernel.org/r/d04f9bbbcde048fb6ffa6f2bdbc6f9b22d5286f9.1727332572.git.zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Reviewed-by: Muchun Song Acked-by: David Hildenbrand Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Mike Rapoport (Microsoft) Cc: Peter Xu Cc: Ryan Roberts Cc: Vishal Moola (Oracle) Signed-off-by: Andrew Morton --- Documentation/mm/split_page_table_lock.rst | 3 --- include/linux/mm.h | 2 -- mm/pgtable-generic.c | 21 --------------------- 3 files changed, 26 deletions(-) diff --git a/Documentation/mm/split_page_table_lock.rst b/Documentation/mm/split_page_table_lock.rst index 08d0e706a32db..581446d4a4eba 100644 --- a/Documentation/mm/split_page_table_lock.rst +++ b/Documentation/mm/split_page_table_lock.rst @@ -16,9 +16,6 @@ There are helpers to lock/unlock a table and other accessor functions: - pte_offset_map_lock() maps PTE and takes PTE table lock, returns pointer to PTE with pointer to its PTE table lock, or returns NULL if no PTE table; - - pte_offset_map_nolock() - maps PTE, returns pointer to PTE with pointer to its PTE table - lock (not taken), or returns NULL if no PTE table; - pte_offset_map_ro_nolock() maps PTE, returns pointer to PTE with pointer to its PTE table lock (not taken), or returns NULL if no PTE table; diff --git a/include/linux/mm.h b/include/linux/mm.h index 0cf45d4b72866..8f5394d75ce23 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3015,8 +3015,6 @@ static inline pte_t *pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd, return pte; } -pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp); pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index daa08b91ab6b2..5297dcc38c37a 100644 --- a/mm/pgtable-generic.c +++ 
b/mm/pgtable-generic.c @@ -305,18 +305,6 @@ pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp) return NULL; } -pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd, - unsigned long addr, spinlock_t **ptlp) -{ - pmd_t pmdval; - pte_t *pte; - - pte = __pte_offset_map(pmd, addr, &pmdval); - if (likely(pte)) - *ptlp = pte_lockptr(mm, &pmdval); - return pte; -} - pte_t *pte_offset_map_ro_nolock(struct mm_struct *mm, pmd_t *pmd, unsigned long addr, spinlock_t **ptlp) { @@ -372,15 +360,6 @@ pte_t *pte_offset_map_rw_nolock(struct mm_struct *mm, pmd_t *pmd, * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s * afterwards. * - * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); - * but when successful, it also outputs a pointer to the spinlock in ptlp - as - * pte_offset_map_lock() does, but in this case without locking it. This helps - * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time - * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock - * pointer for the page table that it returns. In principle, the caller should - * recheck *pmd once the lock is taken; in practice, no callsite needs that - - * either the mmap_lock for write, or pte_same() check on contents, is enough. - * * pte_offset_map_ro_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map(); * but when successful, it also outputs a pointer to the spinlock in ptlp - as * pte_offset_map_lock() does, but in this case without locking it. This helps From 473c371254d2c9906c286c939eaa99d0fac13e38 Mon Sep 17 00:00:00 2001 From: Zhaoyang Huang Date: Thu, 26 Sep 2024 13:06:47 +0800 Subject: [PATCH 037/215] mm: migrate LRU_REFS_MASK bits in folio_migrate_flags Bits of LRU_REFS_MASK are not inherited during migration which lead to new folio start from tier0 when MGLRU enabled. Try to bring as much bits of folio->flags as possible since compaction and alloc_contig_range which introduce migration do happen at times. 
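The helper added below only needs to carry the reference-counter field over; set_mask_bits() replaces just the masked bits of the destination flags and leaves every other flag of the new folio alone. A non-atomic sketch of what that update amounts to (the real helper uses set_mask_bits(), which performs the same computation under a cmpxchg() loop):

/* Illustrative sketch only, not the code added by this patch. */
static void folio_copy_lru_refs(struct folio *new, struct folio *old)
{
	unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK;

	/* roughly set_mask_bits(&new->flags, LRU_REFS_MASK, refs) */
	new->flags = (new->flags & ~LRU_REFS_MASK) | refs;
}
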
Link: https://lkml.kernel.org/r/20240926050647.5653-1-zhaoyang.huang@unisoc.com Signed-off-by: Zhaoyang Huang Suggested-by: Yu Zhao Acked-by: David Hildenbrand Cc: Matthew Wilcox Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 10 ++++++++++ mm/migrate.c | 1 + 2 files changed, 11 insertions(+) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index f4fe593c1400e..6f801c7b36e2f 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -291,6 +291,12 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return true; } +static inline void folio_migrate_refs(struct folio *new, struct folio *old) +{ + unsigned long refs = READ_ONCE(old->flags) & LRU_REFS_MASK; + + set_mask_bits(&new->flags, LRU_REFS_MASK, refs); +} #else /* !CONFIG_LRU_GEN */ static inline bool lru_gen_enabled(void) @@ -313,6 +319,10 @@ static inline bool lru_gen_del_folio(struct lruvec *lruvec, struct folio *folio, return false; } +static inline void folio_migrate_refs(struct folio *new, struct folio *old) +{ + +} #endif /* CONFIG_LRU_GEN */ static __always_inline diff --git a/mm/migrate.c b/mm/migrate.c index dfa24e41e8f95..72c6657f4f72c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -695,6 +695,7 @@ void folio_migrate_flags(struct folio *newfolio, struct folio *folio) if (folio_test_idle(folio)) folio_set_idle(newfolio); + folio_migrate_refs(newfolio, folio); /* * Copy NUMA information to the new page, to prevent over-eager * future migrations of this same page. From cb8e64be7681b857f4976378ece542b3e18a8484 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 25 Sep 2024 15:47:15 -0700 Subject: [PATCH 038/215] mm: optimize truncation of shadow entries Patch series "mm: optimize shadow entries removal", v2. Some of our production workloads which processes a large amount of data spends considerable amount of CPUs on truncation and invalidation of large sized files (100s of GiBs of size). Tracing the operations showed that most of the time is in shadow entries removal. This patch series optimizes the truncation and invalidation operations. This patch (of 2): The kernel truncates the page cache in batches of PAGEVEC_SIZE. For each batch, it traverses the page cache tree and collects the entries (folio and shadow entries) in the struct folio_batch. For the shadow entries present in the folio_batch, it has to traverse the page cache tree for each individual entry to remove them. This patch optimize this by removing them in a single tree traversal. On large machines in our production which run workloads manipulating large amount of data, we have observed that a large amount of CPUs are spent on truncation of very large files (100s of GiBs file sizes). More specifically most of time was spent on shadow entries cleanup, so optimizing the shadow entries cleanup, even a little bit, has good impact. To evaluate the changes, we created 200GiB file on a fuse fs and in a memcg. We created the shadow entries by triggering reclaim through memory.reclaim in that specific memcg and measure the simple truncation operation. 
# time truncate -s 0 file time (sec) Without 5.164 +- 0.059 With-patch 4.21 +- 0.066 (18.47% decrease) Link: https://lkml.kernel.org/r/20240925224716.2904498-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20240925224716.2904498-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Cc: Chris Mason Cc: Matthew Wilcox Cc: Omar Sandoval Signed-off-by: Andrew Morton --- mm/truncate.c | 53 +++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 27 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index 0668cd340a463..1d51c023d9c5a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -68,54 +68,53 @@ static void clear_shadow_entries(struct address_space *mapping, * Unconditionally remove exceptional entries. Usually called from truncate * path. Note that the folio_batch may be altered by this function by removing * exceptional entries similar to what folio_batch_remove_exceptionals() does. + * Please note that indices[] has entries in ascending order as guaranteed by + * either find_get_entries() or find_lock_entries(). */ static void truncate_folio_batch_exceptionals(struct address_space *mapping, struct folio_batch *fbatch, pgoff_t *indices) { + XA_STATE(xas, &mapping->i_pages, indices[0]); + int nr = folio_batch_count(fbatch); + struct folio *folio; int i, j; - bool dax; /* Handled by shmem itself */ if (shmem_mapping(mapping)) return; - for (j = 0; j < folio_batch_count(fbatch); j++) + for (j = 0; j < nr; j++) if (xa_is_value(fbatch->folios[j])) break; - if (j == folio_batch_count(fbatch)) + if (j == nr) return; - dax = dax_mapping(mapping); - if (!dax) { - spin_lock(&mapping->host->i_lock); - xa_lock_irq(&mapping->i_pages); + if (dax_mapping(mapping)) { + for (i = j; i < nr; i++) { + if (xa_is_value(fbatch->folios[i])) + dax_delete_mapping_entry(mapping, indices[i]); + } + goto out; } - for (i = j; i < folio_batch_count(fbatch); i++) { - struct folio *folio = fbatch->folios[i]; - pgoff_t index = indices[i]; - - if (!xa_is_value(folio)) { - fbatch->folios[j++] = folio; - continue; - } + xas_set(&xas, indices[j]); + xas_set_update(&xas, workingset_update_node); - if (unlikely(dax)) { - dax_delete_mapping_entry(mapping, index); - continue; - } + spin_lock(&mapping->host->i_lock); + xas_lock_irq(&xas); - __clear_shadow_entry(mapping, index, folio); + xas_for_each(&xas, folio, indices[nr-1]) { + if (xa_is_value(folio)) + xas_store(&xas, NULL); } - if (!dax) { - xa_unlock_irq(&mapping->i_pages); - if (mapping_shrinkable(mapping)) - inode_add_lru(mapping->host); - spin_unlock(&mapping->host->i_lock); - } - fbatch->nr = j; + xas_unlock_irq(&xas); + if (mapping_shrinkable(mapping)) + inode_add_lru(mapping->host); + spin_unlock(&mapping->host->i_lock); +out: + folio_batch_remove_exceptionals(fbatch); } /** From d3db2c0425915f6b0f273770feee2e2f97dba6a3 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 25 Sep 2024 15:47:16 -0700 Subject: [PATCH 039/215] mm: optimize invalidation of shadow entries The kernel invalidates the page cache in batches of PAGEVEC_SIZE. For each batch, it traverses the page cache tree and collects the entries (folio and shadow entries) in the struct folio_batch. For the shadow entries present in the folio_batch, it has to traverse the page cache tree for each individual entry to remove them. This patch optimize this by removing them in a single tree traversal. To evaluate the changes, we created 200GiB file on a fuse fs and in a memcg. 
We created the shadow entries by triggering reclaim through memory.reclaim in that specific memcg and measure the simple fadvise(DONTNEED) operation. # time xfs_io -c 'fadvise -d 0 ${file_size}' file time (sec) Without 5.12 +- 0.061 With-patch 4.19 +- 0.086 (18.16% decrease) Link: https://lkml.kernel.org/r/20240925224716.2904498-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Cc: Chris Mason Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Omar Sandoval Signed-off-by: Andrew Morton --- mm/truncate.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index 1d51c023d9c5a..520c8cf8f58f4 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -23,42 +23,28 @@ #include #include "internal.h" -/* - * Regular page slots are stabilized by the page lock even without the tree - * itself locked. These unlocked entries need verification under the tree - * lock. - */ -static inline void __clear_shadow_entry(struct address_space *mapping, - pgoff_t index, void *entry) -{ - XA_STATE(xas, &mapping->i_pages, index); - - xas_set_update(&xas, workingset_update_node); - if (xas_load(&xas) != entry) - return; - xas_store(&xas, NULL); -} - static void clear_shadow_entries(struct address_space *mapping, - struct folio_batch *fbatch, pgoff_t *indices) + unsigned long start, unsigned long max) { - int i; + XA_STATE(xas, &mapping->i_pages, start); + struct folio *folio; /* Handled by shmem itself, or for DAX we do nothing. */ if (shmem_mapping(mapping) || dax_mapping(mapping)) return; - spin_lock(&mapping->host->i_lock); - xa_lock_irq(&mapping->i_pages); + xas_set_update(&xas, workingset_update_node); - for (i = 0; i < folio_batch_count(fbatch); i++) { - struct folio *folio = fbatch->folios[i]; + spin_lock(&mapping->host->i_lock); + xas_lock_irq(&xas); + /* Clear all shadow entries from start to max */ + xas_for_each(&xas, folio, max) { if (xa_is_value(folio)) - __clear_shadow_entry(mapping, indices[i], folio); + xas_store(&xas, NULL); } - xa_unlock_irq(&mapping->i_pages); + xas_unlock_irq(&xas); if (mapping_shrinkable(mapping)) inode_add_lru(mapping->host); spin_unlock(&mapping->host->i_lock); @@ -481,7 +467,9 @@ unsigned long mapping_try_invalidate(struct address_space *mapping, folio_batch_init(&fbatch); while (find_lock_entries(mapping, &index, end, &fbatch, indices)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { + int nr = folio_batch_count(&fbatch); + + for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ @@ -508,7 +496,7 @@ unsigned long mapping_try_invalidate(struct address_space *mapping, } if (xa_has_values) - clear_shadow_entries(mapping, &fbatch, indices); + clear_shadow_entries(mapping, indices[0], indices[nr-1]); folio_batch_remove_exceptionals(&fbatch); folio_batch_release(&fbatch); @@ -612,7 +600,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, folio_batch_init(&fbatch); index = start; while (find_get_entries(mapping, &index, end, &fbatch, indices)) { - for (i = 0; i < folio_batch_count(&fbatch); i++) { + int nr = folio_batch_count(&fbatch); + + for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; /* We rely upon deletion not changing folio->index */ @@ -658,7 +648,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, } if (xa_has_values) - clear_shadow_entries(mapping, &fbatch, indices); + clear_shadow_entries(mapping, indices[0], indices[nr-1]); folio_batch_remove_exceptionals(&fbatch); 
folio_batch_release(&fbatch); From 1fa00a568d113db279f683f40636cf72cf73a55d Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Fri, 27 Sep 2024 23:46:37 +0530 Subject: [PATCH 040/215] mm/cma: fix useless return in void function There is a unnecessary return statement at the end of void function cma_activate_area. This can be dropped. While at it, also fix another warning related to unsigned. These are reported by checkpatch as well. WARNING: Prefer 'unsigned int' to bare use of 'unsigned' +unsigned cma_area_count; WARNING: void function return statements are not generally useful + return; +} Link: https://lkml.kernel.org/r/20240927181637.19941-1-quic_pintu@quicinc.com Signed-off-by: Pintu Kumar Cc: Pintu Agarwal Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/cma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/cma.c b/mm/cma.c index 2d9fae9392835..c5869d0001ad1 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -32,7 +32,7 @@ #include "cma.h" struct cma cma_areas[MAX_CMA_AREAS]; -unsigned cma_area_count; +unsigned int cma_area_count; static DEFINE_MUTEX(cma_mutex); phys_addr_t cma_get_base(const struct cma *cma) @@ -135,7 +135,6 @@ static void __init cma_activate_area(struct cma *cma) totalcma_pages -= cma->count; cma->count = 0; pr_err("CMA area %s could not be activated\n", cma->name); - return; } static int __init cma_init_reserved_areas(void) From 12833a732346dcf4e3bde55d6556fedf90743656 Mon Sep 17 00:00:00 2001 From: Ba Jing Date: Tue, 24 Sep 2024 10:14:26 +0800 Subject: [PATCH 041/215] selftests/damon/access_memory_even: remove unused variables By reading the code, I found these variables are never referenced in the code. Just remove them. Link: https://lkml.kernel.org/r/20240924021426.1980-1-bajing@cmss.chinamobile.com Signed-off-by: Ba Jing Reviewed-by: SeongJae Park Reviewed-by: Dev Jain Reviewed-by: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/access_memory_even.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/testing/selftests/damon/access_memory_even.c b/tools/testing/selftests/damon/access_memory_even.c index 3be1214874325..a9f4e9aaf3a93 100644 --- a/tools/testing/selftests/damon/access_memory_even.c +++ b/tools/testing/selftests/damon/access_memory_even.c @@ -14,10 +14,8 @@ int main(int argc, char *argv[]) { char **regions; - clock_t start_clock; int nr_regions; int sz_region; - int access_time_ms; int i; if (argc != 3) { From 9c0a1b99e3919f5fddeeaf96b36f86ccc5cc2a10 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Oct 2024 16:25:27 +0100 Subject: [PATCH 042/215] ksm: use a folio in try_to_merge_one_page() Patch series "Remove PageKsm()". The KSM flag is almost always tested on the folio rather than on the page. This series removes the final users of PageKsm() and makes the flag only This patch (of 5): It is safe to use a folio here because all callers took a refcount on this page. The one wrinkle is that we have to recalculate the value of folio after splitting the page, since it has probably changed. Replaces nine calls to compound_head() with one. 
Link: https://lkml.kernel.org/r/20241002152533.1350629-1-willy@infradead.org Link: https://lkml.kernel.org/r/20241002152533.1350629-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Alex Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index dec536d6d91ad..446de762283be 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1442,28 +1442,29 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, static int try_to_merge_one_page(struct vm_area_struct *vma, struct page *page, struct page *kpage) { + struct folio *folio = page_folio(page); pte_t orig_pte = __pte(0); int err = -EFAULT; if (page == kpage) /* ksm page forked */ return 0; - if (!PageAnon(page)) + if (!folio_test_anon(folio)) goto out; /* * We need the folio lock to read a stable swapcache flag in - * write_protect_page(). We use trylock_page() instead of - * lock_page() because we don't want to wait here - we - * prefer to continue scanning and merging different pages, - * then come back to this page when it is unlocked. + * write_protect_page(). We trylock because we don't want to wait + * here - we prefer to continue scanning and merging different + * pages, then come back to this page when it is unlocked. */ - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto out; - if (PageTransCompound(page)) { + if (folio_test_large(folio)) { if (split_huge_page(page)) goto out_unlock; + folio = page_folio(page); } /* @@ -1472,28 +1473,28 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, * ptes are necessarily already write-protected. But in either * case, we need to lock and check page_count is not raised. */ - if (write_protect_page(vma, page_folio(page), &orig_pte) == 0) { + if (write_protect_page(vma, folio, &orig_pte) == 0) { if (!kpage) { /* - * While we hold page lock, upgrade page from - * PageAnon+anon_vma to PageKsm+NULL stable_node: + * While we hold folio lock, upgrade folio from + * anon to a NULL stable_node with the KSM flag set: * stable_tree_insert() will update stable_node. */ - folio_set_stable_node(page_folio(page), NULL); - mark_page_accessed(page); + folio_set_stable_node(folio, NULL); + folio_mark_accessed(folio); /* - * Page reclaim just frees a clean page with no dirty + * Page reclaim just frees a clean folio with no dirty * ptes: make sure that the ksm page would be swapped. */ - if (!PageDirty(page)) - SetPageDirty(page); + if (!folio_test_dirty(folio)) + folio_mark_dirty(folio); err = 0; } else if (pages_identical(page, kpage)) err = replace_page(vma, page, kpage, orig_pte); } out_unlock: - unlock_page(page); + folio_unlock(folio); out: return err; } From 98c3ca0015b8a5af7f98109261bd9a471097135b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Oct 2024 16:25:28 +0100 Subject: [PATCH 043/215] ksm: convert cmp_and_merge_page() to use a folio By making try_to_merge_two_pages() and stable_tree_search() return a folio, we can replace kpage with kfolio. This replaces 7 calls to compound_head() with one. 
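With this change the return value of stable_tree_search() is three-valued, which the IS_ERR_OR_NULL fixup folded in below accounts for. Distilled, the contract a caller has to honour looks like this (a sketch only; stable_tree_search() is static to mm/ksm.c, so the wrapper is purely illustrative):

static void consume_stable_search(struct page *page)
{
	struct folio *kfolio = stable_tree_search(page);

	if (!kfolio)
		return;		/* no identical content in the stable tree */
	if (kfolio == ERR_PTR(-EBUSY))
		return;		/* the stable node's folio is being migrated */

	/* success: a folio reference was taken for us, drop it when done */
	folio_put(kfolio);
}
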
[cuigaosheng1@huawei.com: add IS_ERR_OR_NULL check for stable_tree_search()] Signed-off-by: Gaosheng Cui Link: https://lkml.kernel.org/r/20241002152533.1350629-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Alex Shi Cc: Gaosheng Cui Signed-off-by: Andrew Morton --- mm/ksm.c | 51 ++++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 446de762283be..f5957bbfcd2f6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1582,7 +1582,7 @@ static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ -static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, +static struct folio *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) @@ -1600,7 +1600,7 @@ static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, if (err) break_cow(rmap_item); } - return err ? NULL : page; + return err ? NULL : page_folio(page); } static __always_inline @@ -1787,9 +1787,9 @@ static __always_inline struct folio *chain(struct ksm_stable_node **s_n_d, * with identical content to the page that we are scanning right now. * * This function returns the stable tree node of identical content if found, - * NULL otherwise. + * -EBUSY if the stable node's page is being migrated, NULL otherwise. */ -static struct page *stable_tree_search(struct page *page) +static struct folio *stable_tree_search(struct page *page) { int nid; struct rb_root *root; @@ -1804,7 +1804,7 @@ static struct page *stable_tree_search(struct page *page) if (page_node && page_node->head != &migrate_nodes) { /* ksm page forked */ folio_get(folio); - return &folio->page; + return folio; } nid = get_kpfn_nid(folio_pfn(folio)); @@ -1899,7 +1899,7 @@ static struct page *stable_tree_search(struct page *page) folio_put(tree_folio); goto replace; } - return &tree_folio->page; + return tree_folio; } } @@ -1913,7 +1913,7 @@ static struct page *stable_tree_search(struct page *page) out: if (is_page_sharing_candidate(page_node)) { folio_get(folio); - return &folio->page; + return folio; } else return NULL; @@ -1963,7 +1963,7 @@ static struct page *stable_tree_search(struct page *page) } stable_node_dup->head = &migrate_nodes; list_add(&stable_node_dup->list, stable_node_dup->head); - return &folio->page; + return folio; chain_append: /* @@ -2217,7 +2217,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; struct ksm_stable_node *stable_node; - struct page *kpage; + struct folio *kfolio; unsigned int checksum; int err; bool max_page_sharing_bypass = false; @@ -2259,31 +2259,32 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite return; } - /* We first start with searching the page inside the stable tree */ - kpage = stable_tree_search(page); - if (kpage == page && rmap_item->head == stable_node) { - put_page(kpage); + /* Start by searching for the folio in the stable tree */ + kfolio = stable_tree_search(page); + if (!IS_ERR_OR_NULL(kfolio) && &kfolio->page == page && + rmap_item->head == stable_node) { + folio_put(kfolio); return; } remove_rmap_item_from_tree(rmap_item); - if (kpage) { - if (PTR_ERR(kpage) == -EBUSY) + if (kfolio) { + if (kfolio == 
ERR_PTR(-EBUSY)) return; - err = try_to_merge_with_ksm_page(rmap_item, page, kpage); + err = try_to_merge_with_ksm_page(rmap_item, page, &kfolio->page); if (!err) { /* * The page was successfully merged: * add its rmap_item to the stable tree. */ - lock_page(kpage); - stable_tree_append(rmap_item, page_stable_node(kpage), + folio_lock(kfolio); + stable_tree_append(rmap_item, folio_stable_node(kfolio), max_page_sharing_bypass); - unlock_page(kpage); + folio_unlock(kfolio); } - put_page(kpage); + folio_put(kfolio); return; } @@ -2292,7 +2293,7 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite if (tree_rmap_item) { bool split; - kpage = try_to_merge_two_pages(rmap_item, page, + kfolio = try_to_merge_two_pages(rmap_item, page, tree_rmap_item, tree_page); /* * If both pages we tried to merge belong to the same compound @@ -2307,20 +2308,20 @@ static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_ite split = PageTransCompound(page) && compound_head(page) == compound_head(tree_page); put_page(tree_page); - if (kpage) { + if (kfolio) { /* * The pages were successfully merged: insert new * node in the stable tree and add both rmap_items. */ - lock_page(kpage); - stable_node = stable_tree_insert(page_folio(kpage)); + folio_lock(kfolio); + stable_node = stable_tree_insert(kfolio); if (stable_node) { stable_tree_append(tree_rmap_item, stable_node, false); stable_tree_append(rmap_item, stable_node, false); } - unlock_page(kpage); + folio_unlock(kfolio); /* * If we fail to insert the page into the stable tree, From 76f1a8261188dfbc46d2957e2bb98dd5f007da7c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Oct 2024 16:25:29 +0100 Subject: [PATCH 044/215] ksm: convert should_skip_rmap_item() to take a folio Remove a call to PageKSM() by passing the folio containing tmp_page to should_skip_rmap_item. Removes a hidden call to compound_head(). Link: https://lkml.kernel.org/r/20241002152533.1350629-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Alex Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index f5957bbfcd2f6..b1c5c8aff41b7 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2402,10 +2402,10 @@ static unsigned int skip_age(rmap_age_t age) /* * Determines if a page should be skipped for the current scan. * - * @page: page to check + * @folio: folio containing the page to check * @rmap_item: associated rmap_item of page */ -static bool should_skip_rmap_item(struct page *page, +static bool should_skip_rmap_item(struct folio *folio, struct ksm_rmap_item *rmap_item) { rmap_age_t age; @@ -2418,7 +2418,7 @@ static bool should_skip_rmap_item(struct page *page, * will essentially ignore them, but we still have to process them * properly. */ - if (PageKsm(page)) + if (folio_test_ksm(folio)) return false; age = rmap_item->age; @@ -2561,7 +2561,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) ksm_scan.rmap_list = &rmap_item->rmap_list; - if (should_skip_rmap_item(tmp_page, rmap_item)) { + if (should_skip_rmap_item(folio, rmap_item)) { folio_put(folio); goto next_page; } From b33cc96c7020b923085046e5cf2e934f41c530ec Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Oct 2024 16:25:30 +0100 Subject: [PATCH 045/215] mm: add PageAnonNotKsm() Check that this anonymous page is really anonymous, not anonymous-or-KSM. 
This optimises the debug check, but its real purpose is to remove the last two users of PageKsm(). [willy@infradead.org: fix assertions] Link: https://lkml.kernel.org/r/ZwApWPER7caIA_N3@casper.infradead.org Link: https://lkml.kernel.org/r/20241002152533.1350629-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Alex Shi Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index cc839e4365c18..1fcef06a2d316 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -689,6 +689,13 @@ static __always_inline bool folio_test_anon(const struct folio *folio) return ((unsigned long)folio->mapping & PAGE_MAPPING_ANON) != 0; } +static __always_inline bool PageAnonNotKsm(const struct page *page) +{ + unsigned long flags = (unsigned long)page_folio(page)->mapping; + + return (flags & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_ANON; +} + static __always_inline bool PageAnon(const struct page *page) { return folio_test_anon(page_folio(page)); @@ -1137,14 +1144,14 @@ static __always_inline int PageAnonExclusive(const struct page *page) static __always_inline void SetPageAnonExclusive(struct page *page) { - VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page); + VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); set_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } static __always_inline void ClearPageAnonExclusive(struct page *page) { - VM_BUG_ON_PGFLAGS(!PageAnon(page) || PageKsm(page), page); + VM_BUG_ON_PGFLAGS(!PageAnonNotKsm(page), page); VM_BUG_ON_PGFLAGS(PageHuge(page) && !PageHead(page), page); clear_bit(PG_anon_exclusive, &PF_ANY(page, 1)->flags); } From b9a256352f3ba697396c26d2a74f4081335f8cef Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Oct 2024 16:25:31 +0100 Subject: [PATCH 046/215] mm: remove PageKsm() All callers have been converted to use folio_test_ksm() or PageAnonNotKsm(), so we can remove this wrapper. Link: https://lkml.kernel.org/r/20241002152533.1350629-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Cc: Alex Shi Signed-off-by: Andrew Morton --- include/linux/page-flags.h | 7 +------ mm/internal.h | 2 +- mm/ksm.c | 4 ++-- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 1fcef06a2d316..e80665bc51fac 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -725,13 +725,8 @@ static __always_inline bool folio_test_ksm(const struct folio *folio) return ((unsigned long)folio->mapping & PAGE_MAPPING_FLAGS) == PAGE_MAPPING_KSM; } - -static __always_inline bool PageKsm(const struct page *page) -{ - return folio_test_ksm(page_folio(page)); -} #else -TESTPAGEFLAG_FALSE(Ksm, ksm) +FOLIO_TEST_FLAG_FALSE(ksm) #endif u64 stable_page_flags(const struct page *page); diff --git a/mm/internal.h b/mm/internal.h index 64c2eb0b160e1..fc2f523258a36 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1356,7 +1356,7 @@ static inline bool gup_must_unshare(struct vm_area_struct *vma, smp_rmb(); /* - * Note that PageKsm() pages cannot be exclusive, and consequently, + * Note that KSM pages cannot be exclusive, and consequently, * cannot get pinned. 
*/ return !PageAnonExclusive(page); diff --git a/mm/ksm.c b/mm/ksm.c index b1c5c8aff41b7..556b8a8f37d04 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -656,7 +656,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr, bool lock_v * * VM_FAULT_SIGBUS could occur if we race with truncation of the * backing file, which also invalidates anonymous pages: that's - * okay, that truncation will have unmapped the PageKsm for us. + * okay, that truncation will have unmapped the KSM page for us. * * VM_FAULT_OOM: at the time of writing (late July 2009), setting * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the @@ -1434,7 +1434,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page * @page: the PageAnon page that we want to replace with kpage - * @kpage: the PageKsm page that we want to map instead of page, + * @kpage: the KSM page that we want to map instead of page, * or NULL the first time when we want to use page as kpage. * * This function returns 0 if the pages were merged, -EFAULT otherwise. From f0327de7067c008088d96592198ec6df045c5a1b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 2 Oct 2024 16:13:27 +0100 Subject: [PATCH 047/215] gup: convert FOLL_TOUCH case in follow_page_pte() to folio We already have the folio here, so just use it, removing three hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20241002151403.1345296-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Signed-off-by: Andrew Morton --- mm/gup.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 4637dab7b54f1..28ae330ec4dd3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -922,14 +922,14 @@ static struct page *follow_page_pte(struct vm_area_struct *vma, } if (flags & FOLL_TOUCH) { if ((flags & FOLL_WRITE) && - !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); + !pte_dirty(pte) && !folio_test_dirty(folio)) + folio_mark_dirty(folio); /* * pte_mkyoung() would be more correct here, but atomic care * is needed to avoid losing the dirty bit: it is easier to use - * mark_page_accessed(). + * folio_mark_accessed(). */ - mark_page_accessed(page); + folio_mark_accessed(folio); } out: pte_unmap_unlock(ptep, ptl); From d7d65b1039019e8789119b498d97cf2531d989a8 Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Thu, 3 Oct 2024 10:18:42 +0530 Subject: [PATCH 048/215] mm: move set_pxd_safe() helpers from generic to platform set_pxd_safe() helpers that serve a specific purpose for both x86 and riscv platforms, do not need to be in the common memory code. Otherwise they just unnecessarily make the common API more complicated. This moves the helpers from common code to platform instead. 
Link: https://lkml.kernel.org/r/20241003044842.246016-1-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Suggested-by: David Hildenbrand Acked-by: Dave Hansen Acked-by: David Hildenbrand Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Thomas Gleixner Cc: David Hildenbrand Cc: Ryan Roberts Signed-off-by: Andrew Morton --- arch/riscv/include/asm/pgtable.h | 19 ++++++++++++++++ arch/x86/include/asm/pgtable.h | 37 +++++++++++++++++++++++++++++++ include/linux/pgtable.h | 38 -------------------------------- 3 files changed, 56 insertions(+), 38 deletions(-) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index e79f15293492d..5d7f3e8c2e508 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -963,6 +963,25 @@ void misc_mem_init(void); extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) +/* + * Use set_p*_safe(), and elide TLB flushing, when confident that *no* + * TLB flush will be required as a result of the "set". For example, use + * in scenarios where it is known ahead of time that the routine is + * setting non-present entries, or re-setting an existing entry to the + * same value. Otherwise, use the typical "set" helpers and flush the + * TLB. + */ +#define set_p4d_safe(p4dp, p4d) \ +({ \ + WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ + set_p4d(p4dp, p4d); \ +}) + +#define set_pgd_safe(pgdp, pgd) \ +({ \ + WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ + set_pgd(pgdp, pgd); \ +}) #endif /* !__ASSEMBLY__ */ #endif /* _ASM_RISCV_PGTABLE_H */ diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 4c2d080d26b4f..593f10aabd45a 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1775,6 +1775,43 @@ bool arch_is_platform_page(u64 paddr); #define arch_is_platform_page arch_is_platform_page #endif +/* + * Use set_p*_safe(), and elide TLB flushing, when confident that *no* + * TLB flush will be required as a result of the "set". For example, use + * in scenarios where it is known ahead of time that the routine is + * setting non-present entries, or re-setting an existing entry to the + * same value. Otherwise, use the typical "set" helpers and flush the + * TLB. + */ +#define set_pte_safe(ptep, pte) \ +({ \ + WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ + set_pte(ptep, pte); \ +}) + +#define set_pmd_safe(pmdp, pmd) \ +({ \ + WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ + set_pmd(pmdp, pmd); \ +}) + +#define set_pud_safe(pudp, pud) \ +({ \ + WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ + set_pud(pudp, pud); \ +}) + +#define set_p4d_safe(p4dp, p4d) \ +({ \ + WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ + set_p4d(p4dp, p4d); \ +}) + +#define set_pgd_safe(pgdp, pgd) \ +({ \ + WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ + set_pgd(pgdp, pgd); \ +}) #endif /* __ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e8b2ac6bd2ae3..23aeffd89a4e0 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -1056,44 +1056,6 @@ static inline int pgd_same(pgd_t pgd_a, pgd_t pgd_b) } #endif -/* - * Use set_p*_safe(), and elide TLB flushing, when confident that *no* - * TLB flush will be required as a result of the "set". 
For example, use - * in scenarios where it is known ahead of time that the routine is - * setting non-present entries, or re-setting an existing entry to the - * same value. Otherwise, use the typical "set" helpers and flush the - * TLB. - */ -#define set_pte_safe(ptep, pte) \ -({ \ - WARN_ON_ONCE(pte_present(*ptep) && !pte_same(*ptep, pte)); \ - set_pte(ptep, pte); \ -}) - -#define set_pmd_safe(pmdp, pmd) \ -({ \ - WARN_ON_ONCE(pmd_present(*pmdp) && !pmd_same(*pmdp, pmd)); \ - set_pmd(pmdp, pmd); \ -}) - -#define set_pud_safe(pudp, pud) \ -({ \ - WARN_ON_ONCE(pud_present(*pudp) && !pud_same(*pudp, pud)); \ - set_pud(pudp, pud); \ -}) - -#define set_p4d_safe(p4dp, p4d) \ -({ \ - WARN_ON_ONCE(p4d_present(*p4dp) && !p4d_same(*p4dp, p4d)); \ - set_p4d(p4dp, p4d); \ -}) - -#define set_pgd_safe(pgdp, pgd) \ -({ \ - WARN_ON_ONCE(pgd_present(*pgdp) && !pgd_same(*pgdp, pgd)); \ - set_pgd(pgdp, pgd); \ -}) - #ifndef __HAVE_ARCH_DO_SWAP_PAGE static inline void arch_do_swap_page_nr(struct mm_struct *mm, struct vm_area_struct *vma, From e26060d1fbd31a8583e2e79addc772249c1e22b4 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Wed, 2 Oct 2024 15:58:22 -0700 Subject: [PATCH 049/215] mm: swap: make some count_mthp_stat() call-sites be THP-agnostic. In commit 246d3aa3e531 ("mm: cleanup count_mthp_stat() definition"), Ryan Roberts has pointed out the merits of mm code that does not require THP, to be compile-able without requiring THP ifdefs. As a step in that direction, he has moved count_mthp_stat() to be always defined, resolving to a no-op if THP is not defined. Barry Song referred me to Ryan's commit when I was working on the "mm: zswap swap-out of large folios" patch-series [1]. This patch propagates the benefits of the above change to page_io.c and vmscan.c. As a result, there is one less reason to have the ifdef THP in these code sections. 
[1]: https://patchwork.kernel.org/project/linux-mm/list/?series=894347 Link: https://lkml.kernel.org/r/20241002225822.9006-1-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Ryan Roberts Cc: Wajdi Feghali Cc: Yosry Ahmed Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- mm/page_io.c | 2 +- mm/vmscan.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index 40392782cdcb9..ef42651d5ef89 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -288,8 +288,8 @@ static inline void count_swpout_vm_event(struct folio *folio) count_memcg_folio_events(folio, THP_SWPOUT, 1); count_vm_event(THP_SWPOUT); } - count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); #endif + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPOUT); count_memcg_folio_events(folio, PSWPOUT, folio_nr_pages(folio)); count_vm_events(PSWPOUT, folio_nr_pages(folio)); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 8f25dd6cec54b..6a3c498383fa1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1258,8 +1258,8 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, THP_SWPOUT_FALLBACK, 1); count_vm_event(THP_SWPOUT_FALLBACK); } - count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); #endif + count_mthp_stat(order, MTHP_STAT_SWPOUT_FALLBACK); if (!add_to_swap(folio)) goto activate_locked_split; } From 5f5a3e9530beccce4564143eae1518dc5468bb9b Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 2 Oct 2024 15:51:50 -0700 Subject: [PATCH 050/215] mm/truncate: reset xa_has_values flag on each iteration Currently mapping_try_invalidate() and invalidate_inode_pages2_range() traverses the xarray in batches and then for each batch, maintains and sets the flag named xa_has_values if the batch has a shadow entry to clear the entries at the end of the iteration. However they forgot to reset the flag at the end of the iteration which causes them to always try to clear the shadow entries in the subsequent iterations where there might not be any shadow entries. Fix this inefficiency. 
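The fix is purely a matter of variable scope. A stand-alone illustration of the effect (the helpers here are hypothetical stand-ins, not the real truncate/folio_batch API):

#include <stdbool.h>

/* Hypothetical helpers, for illustration only. */
static bool batch_has_shadow(int batch) { return batch == 0; }
static void clear_batch_shadows(int batch) { /* extra tree traversal */ }

static void before_fix(int nr_batches)
{
	bool xa_has_values = false;	/* declared once: sticky across batches */

	for (int batch = 0; batch < nr_batches; batch++) {
		if (batch_has_shadow(batch))
			xa_has_values = true;
		if (xa_has_values)	/* stays true for every later batch */
			clear_batch_shadows(batch);
	}
}

static void after_fix(int nr_batches)
{
	for (int batch = 0; batch < nr_batches; batch++) {
		bool xa_has_values = batch_has_shadow(batch);	/* per batch */

		if (xa_has_values)
			clear_batch_shadows(batch);
	}
}
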
Link: https://lkml.kernel.org/r/20241002225150.2334504-1-shakeel.butt@linux.dev Fixes: 61c663e020d2 ("mm/truncate: batch-clear shadow entries") Signed-off-by: Shakeel Butt Acked-by: Yu Zhao Cc: Johannes Weiner Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/truncate.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index 520c8cf8f58f4..e5151703ba04a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -463,10 +463,10 @@ unsigned long mapping_try_invalidate(struct address_space *mapping, unsigned long ret; unsigned long count = 0; int i; - bool xa_has_values = false; folio_batch_init(&fbatch); while (find_lock_entries(mapping, &index, end, &fbatch, indices)) { + bool xa_has_values = false; int nr = folio_batch_count(&fbatch); for (i = 0; i < nr; i++) { @@ -592,7 +592,6 @@ int invalidate_inode_pages2_range(struct address_space *mapping, int ret = 0; int ret2 = 0; int did_range_unmap = 0; - bool xa_has_values = false; if (mapping_empty(mapping)) return 0; @@ -600,6 +599,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, folio_batch_init(&fbatch); index = start; while (find_get_entries(mapping, &index, end, &fbatch, indices)) { + bool xa_has_values = false; int nr = folio_batch_count(&fbatch); for (i = 0; i < nr; i++) { From b314e21596a48d21a88b8c6a98ecfea8d7b2d2a1 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 7 Oct 2024 12:53:35 +0100 Subject: [PATCH 051/215] maple_tree: do not hash pointers on dump in debug mode Many maple tree values output when an mt_validate() or equivalent hits an issue utilise tagged pointers, most notably parent nodes. Also some pivots/slots contain meaningful values, output as pointers, such as the index of the last entry with data for example. All pointer values such as this are destroyed by kernel pointer hashing rendering the debug output obtained from CONFIG_DEBUG_VM_MAPLE_TREE considerably less usable. Update this code to output the raw pointers using %px rather than %p when CONFIG_DEBUG_VM_MAPLE_TREE is defined. This is justified, as the use of this configuration flag indicates that this is a test environment. Userland does not understand %px, so use %p there. In an abundance of caution, if CONFIG_DEBUG_VM_MAPLE_TREE is not set, also use %p to avoid exposing raw kernel pointers except when we are positive a testing mode is enabled. This was inspired by the investigation performed in recent debugging efforts around a maple tree regression [0] where kernel pointer tagging had to be disabled in order to obtain truly meaningful and useful data. [0]:https://lore.kernel.org/all/20241001023402.3374-1-spasswolf@web.de/ Link: https://lkml.kernel.org/r/20241007115335.90104-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Liam R. Howlett Cc: Sidhartha Kumar Signed-off-by: Andrew Morton --- lib/maple_tree.c | 100 ++++++++++++++++++++++++++++------------------- 1 file changed, 59 insertions(+), 41 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c5987244ff636..b3b1d4b8126b4 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -64,6 +64,21 @@ #define CREATE_TRACE_POINTS #include +/* + * Kernel pointer hashing renders much of the maple tree dump useless as tagged + * pointers get hashed to arbitrary values. + * + * If CONFIG_DEBUG_VM_MAPLE_TREE is set we are in a debug mode where it is + * permissible to bypass this. Otherwise remain cautious and retain the hashing. + * + * Userland doesn't know about %px so also use %p there. 
+ */ +#if defined(__KERNEL__) && defined(CONFIG_DEBUG_VM_MAPLE_TREE) +#define PTR_FMT "%px" +#else +#define PTR_FMT "%p" +#endif + #define MA_ROOT_PARENT 1 /* @@ -5414,7 +5429,8 @@ void *mas_store(struct ma_state *mas, void *entry) trace_ma_write(__func__, mas, 0, entry); #ifdef CONFIG_DEBUG_MAPLE_TREE if (MAS_WARN_ON(mas, mas->index > mas->last)) - pr_err("Error %lX > %lX %p\n", mas->index, mas->last, entry); + pr_err("Error %lX > %lX " PTR_FMT "\n", mas->index, mas->last, + entry); if (mas->index > mas->last) { mas_set_err(mas, -EINVAL); @@ -7119,14 +7135,14 @@ static void mt_dump_entry(void *entry, unsigned long min, unsigned long max, mt_dump_range(min, max, depth, format); if (xa_is_value(entry)) - pr_cont("value %ld (0x%lx) [%p]\n", xa_to_value(entry), - xa_to_value(entry), entry); + pr_cont("value %ld (0x%lx) [" PTR_FMT "]\n", xa_to_value(entry), + xa_to_value(entry), entry); else if (xa_is_zero(entry)) pr_cont("zero (%ld)\n", xa_to_internal(entry)); else if (mt_is_reserved(entry)) - pr_cont("UNKNOWN ENTRY (%p)\n", entry); + pr_cont("UNKNOWN ENTRY (" PTR_FMT ")\n", entry); else - pr_cont("%p\n", entry); + pr_cont(PTR_FMT "\n", entry); } static void mt_dump_range64(const struct maple_tree *mt, void *entry, @@ -7142,13 +7158,13 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, for (i = 0; i < MAPLE_RANGE64_SLOTS - 1; i++) { switch(format) { case mt_dump_hex: - pr_cont("%p %lX ", node->slot[i], node->pivot[i]); + pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: - pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } - pr_cont("%p\n", node->slot[i]); + pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_RANGE64_SLOTS; i++) { unsigned long last = max; @@ -7170,11 +7186,11 @@ static void mt_dump_range64(const struct maple_tree *mt, void *entry, if (last > max) { switch(format) { case mt_dump_hex: - pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n", + pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: - pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } @@ -7204,13 +7220,13 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, for (i = 0; i < MAPLE_ARANGE64_SLOTS - 1; i++) { switch (format) { case mt_dump_hex: - pr_cont("%p %lX ", node->slot[i], node->pivot[i]); + pr_cont(PTR_FMT " %lX ", node->slot[i], node->pivot[i]); break; case mt_dump_dec: - pr_cont("%p %lu ", node->slot[i], node->pivot[i]); + pr_cont(PTR_FMT " %lu ", node->slot[i], node->pivot[i]); } } - pr_cont("%p\n", node->slot[i]); + pr_cont(PTR_FMT "\n", node->slot[i]); for (i = 0; i < MAPLE_ARANGE64_SLOTS; i++) { unsigned long last = max; @@ -7229,11 +7245,11 @@ static void mt_dump_arange64(const struct maple_tree *mt, void *entry, if (last > max) { switch(format) { case mt_dump_hex: - pr_err("node %p last (%lx) > max (%lx) at pivot %d!\n", + pr_err("node " PTR_FMT " last (%lx) > max (%lx) at pivot %d!\n", node, last, max, i); break; case mt_dump_dec: - pr_err("node %p last (%lu) > max (%lu) at pivot %d!\n", + pr_err("node " PTR_FMT " last (%lu) > max (%lu) at pivot %d!\n", node, last, max, i); } } @@ -7251,8 +7267,8 @@ static void mt_dump_node(const struct maple_tree *mt, void *entry, mt_dump_range(min, max, depth, format); - pr_cont("node %p depth %d type %d parent %p", node, depth, type, - node ? 
node->parent : NULL); + pr_cont("node " PTR_FMT " depth %d type %d parent " PTR_FMT, node, + depth, type, node ? node->parent : NULL); switch (type) { case maple_dense: pr_cont("\n"); @@ -7280,7 +7296,7 @@ void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) { void *entry = rcu_dereference_check(mt->ma_root, mt_locked(mt)); - pr_info("maple_tree(%p) flags %X, height %u root %p\n", + pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); if (!xa_is_node(entry)) mt_dump_entry(entry, 0, 0, 0, format); @@ -7332,7 +7348,7 @@ static void mas_validate_gaps(struct ma_state *mas) MT_BUG_ON(mas->tree, !entry); if (gap > p_end - p_start + 1) { - pr_err("%p[%u] %lu >= %lu - %lu + 1 (%lu)\n", + pr_err(PTR_FMT "[%u] %lu >= %lu - %lu + 1 (%lu)\n", mas_mn(mas), i, gap, p_end, p_start, p_end - p_start + 1); MT_BUG_ON(mas->tree, gap > p_end - p_start + 1); @@ -7352,19 +7368,19 @@ static void mas_validate_gaps(struct ma_state *mas) MT_BUG_ON(mas->tree, !gaps); offset = ma_meta_gap(node); if (offset > i) { - pr_err("gap offset %p[%u] is invalid\n", node, offset); + pr_err("gap offset " PTR_FMT "[%u] is invalid\n", node, offset); MT_BUG_ON(mas->tree, 1); } if (gaps[offset] != max_gap) { - pr_err("gap %p[%u] is not the largest gap %lu\n", + pr_err("gap " PTR_FMT "[%u] is not the largest gap %lu\n", node, offset, max_gap); MT_BUG_ON(mas->tree, 1); } for (i++ ; i < mt_slot_count(mte); i++) { if (gaps[i] != 0) { - pr_err("gap %p[%u] beyond node limit != 0\n", + pr_err("gap " PTR_FMT "[%u] beyond node limit != 0\n", node, i); MT_BUG_ON(mas->tree, 1); } @@ -7378,7 +7394,7 @@ static void mas_validate_gaps(struct ma_state *mas) p_mn = mte_parent(mte); MT_BUG_ON(mas->tree, max_gap > mas->max); if (ma_gaps(p_mn, mas_parent_type(mas, mte))[p_slot] != max_gap) { - pr_err("gap %p[%u] != %lu\n", p_mn, p_slot, max_gap); + pr_err("gap " PTR_FMT "[%u] != %lu\n", p_mn, p_slot, max_gap); mt_dump(mas->tree, mt_dump_hex); MT_BUG_ON(mas->tree, 1); } @@ -7408,11 +7424,11 @@ static void mas_validate_parent_slot(struct ma_state *mas) node = mas_slot(mas, slots, i); if (i == p_slot) { if (node != mas->node) - pr_err("parent %p[%u] does not have %p\n", + pr_err("parent " PTR_FMT "[%u] does not have " PTR_FMT "\n", parent, i, mas_mn(mas)); MT_BUG_ON(mas->tree, node != mas->node); } else if (node == mas->node) { - pr_err("Invalid child %p at parent %p[%u] p_slot %u\n", + pr_err("Invalid child " PTR_FMT " at parent " PTR_FMT "[%u] p_slot %u\n", mas_mn(mas), parent, i, p_slot); MT_BUG_ON(mas->tree, node == mas->node); } @@ -7434,20 +7450,20 @@ static void mas_validate_child_slot(struct ma_state *mas) child = mas_slot(mas, slots, i); if (!child) { - pr_err("Non-leaf node lacks child at %p[%u]\n", + pr_err("Non-leaf node lacks child at " PTR_FMT "[%u]\n", mas_mn(mas), i); MT_BUG_ON(mas->tree, 1); } if (mte_parent_slot(child) != i) { - pr_err("Slot error at %p[%u]: child %p has pslot %u\n", + pr_err("Slot error at " PTR_FMT "[%u]: child " PTR_FMT " has pslot %u\n", mas_mn(mas), i, mte_to_node(child), mte_parent_slot(child)); MT_BUG_ON(mas->tree, 1); } if (mte_parent(child) != mte_to_node(mas->node)) { - pr_err("child %p has parent %p not %p\n", + pr_err("child " PTR_FMT " has parent " PTR_FMT " not " PTR_FMT "\n", mte_to_node(child), mte_parent(child), mte_to_node(mas->node)); MT_BUG_ON(mas->tree, 1); @@ -7477,24 +7493,24 @@ static void mas_validate_limits(struct ma_state *mas) piv = mas_safe_pivot(mas, pivots, i, type); if (!piv && (i != 0)) { - pr_err("Missing node 
limit pivot at %p[%u]", + pr_err("Missing node limit pivot at " PTR_FMT "[%u]", mas_mn(mas), i); MAS_WARN_ON(mas, 1); } if (prev_piv > piv) { - pr_err("%p[%u] piv %lu < prev_piv %lu\n", + pr_err(PTR_FMT "[%u] piv %lu < prev_piv %lu\n", mas_mn(mas), i, piv, prev_piv); MAS_WARN_ON(mas, piv < prev_piv); } if (piv < mas->min) { - pr_err("%p[%u] %lu < %lu\n", mas_mn(mas), i, + pr_err(PTR_FMT "[%u] %lu < %lu\n", mas_mn(mas), i, piv, mas->min); MAS_WARN_ON(mas, piv < mas->min); } if (piv > mas->max) { - pr_err("%p[%u] %lu > %lu\n", mas_mn(mas), i, + pr_err(PTR_FMT "[%u] %lu > %lu\n", mas_mn(mas), i, piv, mas->max); MAS_WARN_ON(mas, piv > mas->max); } @@ -7504,7 +7520,7 @@ static void mas_validate_limits(struct ma_state *mas) } if (mas_data_end(mas) != i) { - pr_err("node%p: data_end %u != the last slot offset %u\n", + pr_err("node" PTR_FMT ": data_end %u != the last slot offset %u\n", mas_mn(mas), mas_data_end(mas), i); MT_BUG_ON(mas->tree, 1); } @@ -7513,8 +7529,8 @@ static void mas_validate_limits(struct ma_state *mas) void *entry = mas_slot(mas, slots, i); if (entry && (i != mt_slots[type] - 1)) { - pr_err("%p[%u] should not have entry %p\n", mas_mn(mas), - i, entry); + pr_err(PTR_FMT "[%u] should not have entry " PTR_FMT "\n", + mas_mn(mas), i, entry); MT_BUG_ON(mas->tree, entry != NULL); } @@ -7524,7 +7540,7 @@ static void mas_validate_limits(struct ma_state *mas) if (!piv) continue; - pr_err("%p[%u] should not have piv %lu\n", + pr_err(PTR_FMT "[%u] should not have piv %lu\n", mas_mn(mas), i, piv); MAS_WARN_ON(mas, i < mt_pivots[type] - 1); } @@ -7549,7 +7565,7 @@ static void mt_validate_nulls(struct maple_tree *mt) do { entry = mas_slot(&mas, slots, offset); if (!last && !entry) { - pr_err("Sequential nulls end at %p[%u]\n", + pr_err("Sequential nulls end at " PTR_FMT "[%u]\n", mas_mn(&mas), offset); } MT_BUG_ON(mt, !last && !entry); @@ -7591,7 +7607,8 @@ void mt_validate(struct maple_tree *mt) end = mas_data_end(&mas); if (MAS_WARN_ON(&mas, (end < mt_min_slot_count(mas.node)) && (mas.max != ULONG_MAX))) { - pr_err("Invalid size %u of %p\n", end, mas_mn(&mas)); + pr_err("Invalid size %u of " PTR_FMT "\n", + end, mas_mn(&mas)); } mas_validate_parent_slot(&mas); @@ -7607,7 +7624,8 @@ EXPORT_SYMBOL_GPL(mt_validate); void mas_dump(const struct ma_state *mas) { - pr_err("MAS: tree=%p enode=%p ", mas->tree, mas->node); + pr_err("MAS: tree=" PTR_FMT " enode=" PTR_FMT " ", + mas->tree, mas->node); switch (mas->status) { case ma_active: pr_err("(ma_active)"); @@ -7671,7 +7689,7 @@ void mas_dump(const struct ma_state *mas) pr_err("[%u/%u] index=%lx last=%lx\n", mas->offset, mas->end, mas->index, mas->last); - pr_err(" min=%lx max=%lx alloc=%p, depth=%u, flags=%x\n", + pr_err(" min=%lx max=%lx alloc=" PTR_FMT ", depth=%u, flags=%x\n", mas->min, mas->max, mas->alloc, mas->depth, mas->mas_flags); if (mas->index > mas->last) pr_err("Check index & last\n"); @@ -7680,7 +7698,7 @@ EXPORT_SYMBOL_GPL(mas_dump); void mas_wr_dump(const struct ma_wr_state *wr_mas) { - pr_err("WR_MAS: node=%p r_min=%lx r_max=%lx\n", + pr_err("WR_MAS: node=" PTR_FMT " r_min=%lx r_max=%lx\n", wr_mas->node, wr_mas->r_min, wr_mas->r_max); pr_err(" type=%u off_end=%u, node_end=%u, end_piv=%lx\n", wr_mas->type, wr_mas->offset_end, wr_mas->mas->end, From 04f315a7dc43a097050534679600974592494a22 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 4 Oct 2024 09:48:31 -0700 Subject: [PATCH 052/215] mm: remove misleading 'unlikely' hint in vms_gather_munmap_vmas() Performance analysis using branch annotation on a fleet of 200 hosts 
running web servers revealed that the 'unlikely' hint in vms_gather_munmap_vmas() was 100% consistently incorrect. In all observed cases, the branch behavior contradicted the hint. Remove the 'unlikely' qualifier from the condition checking 'vms->uf'. By doing so, we allow the compiler to make optimization decisions based on its own heuristics and profiling data, rather than relying on a static hint that has proven to be inaccurate in real-world scenarios. Link: https://lkml.kernel.org/r/20241004164832.218681-1-leitao@debian.org Signed-off-by: Breno Leitao Reviewed-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/vma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vma.c b/mm/vma.c index 7621384d64cf5..bb7cfa2dc2827 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1254,7 +1254,7 @@ int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, else if (is_data_mapping(next->vm_flags)) vms->data_vm += nrpages; - if (unlikely(vms->uf)) { + if (vms->uf) { /* * If userfaultfd_unmap_prep returns an error the vmas * will remain split, but userland will get a From 7f24cbc9c4d42db8a3c8484d120cf9c1da557fab Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:29 +0200 Subject: [PATCH 053/215] mm/mmap: teach generic_get_unmapped_area{_topdown} to handle hugetlb mappings Patch series "Unify hugetlb into arch_get_unmapped_area functions", v4. This is an attempt to get rid of a fair amount of duplicated code wrt. hugetlb and *get_unmapped_area* functions. HugeTLB registers a .get_unmapped_area function which gets called from __get_unmapped_area(). hugetlb_get_unmapped_area() is defined by a bunch of architectures and it also has a generic definition for those that do not define it. Long story short, there is a ton of duplicated code between specific hugetlb *_get_unmapped_area_* functions and mm-core functions, so we can do better by teaching arch_get_unmapped_area* functions how to deal with hugetlb mappings. Note that not a lot of things need to be taught though. hugetlb_get_unmapped_area, that gets called for hugetlb mappings, runs some sanity checks prior to calling mm_get_unmapped_area_vmflags(), so we do not need to do that down the road in the respective {generic,arch}_get_unmapped_area* functions. More information can be found in the respective patches. LTP mmapstress hugetlb selftests were run successfully on: This patch (of 9): We want to stop special casing hugetlb mappings and make them go through generic channels, so teach generic_get_unmapped_area{_topdown} to handle those. The main difference is that we set info.align_mask for huge mappings.
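As a rough illustration (not part of the patch; 4 KiB base pages and a 2 MiB huge page size are assumed), the alignment mask that generic_get_unmapped_area{_topdown} now feeds into vm_unmapped_area() for hugetlb files has the same shape as huge_page_mask_align(), i.e. PAGE_MASK & ~huge_page_mask(h):

#include <stdio.h>

#define PAGE_SHIFT	12
#define PAGE_MASK	(~((1UL << PAGE_SHIFT) - 1))
#define HPAGE_SHIFT	21
#define HPAGE_MASK	(~((1UL << HPAGE_SHIFT) - 1))

int main(void)
{
	/* same shape as huge_page_mask_align(): PAGE_MASK & ~huge_page_mask(h) */
	unsigned long align_mask = PAGE_MASK & ~HPAGE_MASK;

	/* bits 12..20 set: the returned address must be 2 MiB aligned */
	printf("align_mask = %#lx\n", align_mask);	/* prints 0x1ff000 */
	return 0;
}

With info.align_offset left at zero, vm_unmapped_area() only hands back addresses whose masked bits are clear, i.e. huge-page aligned addresses.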
Link: https://lkml.kernel.org/r/20241007075037.267650-1-osalvador@suse.de Link: https://lkml.kernel.org/r/20241007075037.267650-2-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 10 ++++++++++ mm/mmap.c | 4 ++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index e4697539b665a..368d552e4860c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -1035,9 +1035,19 @@ void hugetlb_unregister_node(struct node *node); */ bool is_raw_hwpoison_page_in_hugepage(struct page *page); +static inline unsigned long huge_page_mask_align(struct file *file) +{ + return PAGE_MASK & ~huge_page_mask(hstate_file(file)); +} + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; +static inline unsigned long huge_page_mask_align(struct file *file) +{ + return 0; +} + static inline struct hugepage_subpool *hugetlb_folio_subpool(struct folio *folio) { return NULL; diff --git a/mm/mmap.c b/mm/mmap.c index 79d541f1502b2..fb91b2cb55615 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -776,6 +776,8 @@ generic_get_unmapped_area(struct file *filp, unsigned long addr, info.low_limit = mm->mmap_base; info.high_limit = mmap_end; info.start_gap = stack_guard_placement(vm_flags); + if (filp && is_file_hugepages(filp)) + info.align_mask = huge_page_mask_align(filp); return vm_unmapped_area(&info); } @@ -826,6 +828,8 @@ generic_get_unmapped_area_topdown(struct file *filp, unsigned long addr, info.low_limit = PAGE_SIZE; info.high_limit = arch_get_mmap_base(addr, mm->mmap_base); info.start_gap = stack_guard_placement(vm_flags); + if (filp && is_file_hugepages(filp)) + info.align_mask = huge_page_mask_align(filp); addr = vm_unmapped_area(&info); /* From 7d7dba7f6891addedeb894e6f74b46177900874c Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:30 +0200 Subject: [PATCH 054/215] arch/s390: teach arch_get_unmapped_area{_topdown} to handle hugetlb mappings We want to stop special casing hugetlb mappings and make them go through generic channels, so teach arch_get_unmapped_area{_topdown} to handle those. s390 specific hugetlb function does not set info.align_offset, so do the same here for compatibility. 
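To see what leaving info.align_offset at zero means in practice, here is a tiny userspace sketch of the alignment step vm_unmapped_area() performs (a simplified model, not the kernel code; 4 KiB base pages and a 1 MiB huge page size are assumed):

#include <stdio.h>

#define PAGE_MASK	(~0xfffUL)
#define HPAGE_MASK	(~((1UL << 20) - 1))	/* assumed 1 MiB huge pages */

/* simplified model of the candidate-address alignment inside vm_unmapped_area() */
static unsigned long align_up(unsigned long addr, unsigned long align_mask,
			      unsigned long align_offset)
{
	return addr + ((align_offset - addr) & align_mask);
}

int main(void)
{
	unsigned long mask = PAGE_MASK & ~HPAGE_MASK;	/* huge_page_mask_align() shape */

	/* align_offset stays 0: candidates snap to the next 1 MiB boundary */
	printf("%#lx\n", align_up(0x3ff7b000, mask, 0));	/* prints 0x40000000 */
	return 0;
}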
Link: https://lkml.kernel.org/r/20241007075037.267650-3-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/s390/mm/mmap.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/s390/mm/mmap.c b/arch/s390/mm/mmap.c index 96efa061ce01b..33f3504be90b5 100644 --- a/arch/s390/mm/mmap.c +++ b/arch/s390/mm/mmap.c @@ -17,6 +17,7 @@ #include #include #include +#include #include static unsigned long stack_maxrandom_size(void) @@ -73,6 +74,8 @@ static inline unsigned long mmap_base(unsigned long rnd, static int get_align_mask(struct file *filp, unsigned long flags) { + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); if (!(current->flags & PF_RANDOMIZE)) return 0; if (filp || (flags & MAP_SHARED)) @@ -106,7 +109,8 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, info.low_limit = mm->mmap_base; info.high_limit = TASK_SIZE; info.align_mask = get_align_mask(filp, flags); - info.align_offset = pgoff << PAGE_SHIFT; + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); if (offset_in_page(addr)) return addr; @@ -144,7 +148,8 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, unsigned long ad info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; info.align_mask = get_align_mask(filp, flags); - info.align_offset = pgoff << PAGE_SHIFT; + if (!(filp && is_file_hugepages(filp))) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); /* From 1317a5e7f7b1336eac4097f0ef4f5cd7ae72f1d0 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:31 +0200 Subject: [PATCH 055/215] arch/x86: teach arch_get_unmapped_area_vmflags to handle hugetlb mappings We want to stop special casing hugetlb mappings and make them go through generic channels, so teach arch_get_unmapped_area_{topdown_}vmflags to handle those. x86 specific hugetlb function does not set either info.start_gap or info.align_offset so the same here for compatibility. Link: https://lkml.kernel.org/r/20241007075037.267650-4-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/kernel/sys_x86_64.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 87f8c9a71c496..776ae6fa7f2d6 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include @@ -25,8 +26,10 @@ /* * Align a virtual address to avoid aliasing in the I$ on AMD F15h. 
*/ -static unsigned long get_align_mask(void) +static unsigned long get_align_mask(struct file *filp) { + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); /* handle 32- and 64-bit case with a single conditional */ if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) return 0; @@ -49,7 +52,7 @@ static unsigned long get_align_mask(void) */ static unsigned long get_align_bits(void) { - return va_align.bits & get_align_mask(); + return va_align.bits & get_align_mask(NULL); } static int __init control_va_addr_alignment(char *str) @@ -148,12 +151,15 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, info.length = len; info.low_limit = begin; info.high_limit = end; - info.align_offset = pgoff << PAGE_SHIFT; - info.start_gap = stack_guard_placement(vm_flags); + if (!(filp && is_file_hugepages(filp))) { + info.align_offset = pgoff << PAGE_SHIFT; + info.start_gap = stack_guard_placement(vm_flags); + } if (filp) { - info.align_mask = get_align_mask(); + info.align_mask = get_align_mask(filp); info.align_offset += get_align_bits(); } + return vm_unmapped_area(&info); } @@ -199,7 +205,10 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, info.low_limit = PAGE_SIZE; info.high_limit = get_mmap_base(0); - info.start_gap = stack_guard_placement(vm_flags); + if (!(filp && is_file_hugepages(filp))) { + info.start_gap = stack_guard_placement(vm_flags); + info.align_offset = pgoff << PAGE_SHIFT; + } /* * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area @@ -211,9 +220,8 @@ arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr0, if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; - info.align_offset = pgoff << PAGE_SHIFT; if (filp) { - info.align_mask = get_align_mask(); + info.align_mask = get_align_mask(filp); info.align_offset += get_align_bits(); } addr = vm_unmapped_area(&info); From a8d457b29b017a8499ff885d64804de3ff203dee Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:32 +0200 Subject: [PATCH 056/215] arch/sparc: teach arch_get_unmapped_area{_topdown} to handle hugetlb mappings We want to stop special casing hugetlb mappings and make them go through generic channels, so teach arch_get_unmapped_area{_topdown} to handle those. sparc specific hugetlb function does not set info.align_offset, and does not care about adjusting the align_mask for MAP_SHARED cases, so the same here for compatibility. 
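The MAP_SHARED colouring check that is now skipped for hugetlb files verifies that the requested address and the file offset land on the same cache colour. A compact userspace sketch of that congruence test (illustration only; the SHMLBA value is assumed for the example):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT	12
#define SHMLBA		0x8000UL	/* assumed colour granularity for the example */

/* same shape as the check: the offset-adjusted address must be colour aligned */
static bool colour_ok(unsigned long addr, unsigned long pgoff)
{
	return ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1)) == 0;
}

int main(void)
{
	printf("%d\n", colour_ok(0x200000, 0));	/* 1: colour matches */
	printf("%d\n", colour_ok(0x201000, 0));	/* 0: would alias in the D-cache */
	printf("%d\n", colour_ok(0x208000, 8));	/* 1: offset and address agree mod SHMLBA */
	return 0;
}

Huge-page-aligned mappings already satisfy a far stricter alignment, which is why the colouring logic can be bypassed for them.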
Link: https://lkml.kernel.org/r/20241007075037.267650-5-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/sparc/kernel/sys_sparc_32.c | 17 +++++++++++---- arch/sparc/kernel/sys_sparc_64.c | 37 +++++++++++++++++++++++++------- 2 files changed, 42 insertions(+), 12 deletions(-) diff --git a/arch/sparc/kernel/sys_sparc_32.c b/arch/sparc/kernel/sys_sparc_32.c index 80822f922e767..fb31bc0c5b488 100644 --- a/arch/sparc/kernel/sys_sparc_32.c +++ b/arch/sparc/kernel/sys_sparc_32.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -42,12 +43,16 @@ SYSCALL_DEFINE0(getpagesize) unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct vm_unmapped_area_info info = {}; + bool file_hugepage = false; + + if (filp && is_file_hugepages(filp)) + file_hugepage = true; if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && + if (!file_hugepage && (flags & MAP_SHARED) && ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; @@ -62,9 +67,13 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi info.length = len; info.low_limit = addr; info.high_limit = TASK_SIZE; - info.align_mask = (flags & MAP_SHARED) ? - (PAGE_MASK & (SHMLBA - 1)) : 0; - info.align_offset = pgoff << PAGE_SHIFT; + if (!file_hugepage) { + info.align_mask = (flags & MAP_SHARED) ? + (PAGE_MASK & (SHMLBA - 1)) : 0; + info.align_offset = pgoff << PAGE_SHIFT; + } else { + info.align_mask = huge_page_mask_align(filp); + } return vm_unmapped_area(&info); } diff --git a/arch/sparc/kernel/sys_sparc_64.c b/arch/sparc/kernel/sys_sparc_64.c index acade309dc2fb..c5a284df7b417 100644 --- a/arch/sparc/kernel/sys_sparc_64.c +++ b/arch/sparc/kernel/sys_sparc_64.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -87,6 +88,16 @@ static inline unsigned long COLOR_ALIGN(unsigned long addr, return base + off; } +static unsigned long get_align_mask(struct file *filp, unsigned long flags) +{ + if (filp && is_file_hugepages(filp)) + return huge_page_mask_align(filp); + if (filp || (flags & MAP_SHARED)) + return PAGE_MASK & (SHMLBA - 1); + + return 0; +} + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags, vm_flags_t vm_flags) { struct mm_struct *mm = current->mm; @@ -94,12 +105,16 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi unsigned long task_size = TASK_SIZE; int do_color_align; struct vm_unmapped_area_info info = {}; + bool file_hugepage = false; + + if (filp && is_file_hugepages(filp)) + file_hugepage = true; if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate * cache aliasing constraints. 
*/ - if ((flags & MAP_SHARED) && + if (!file_hugepage && (flags & MAP_SHARED) && ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; @@ -111,7 +126,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi return -ENOMEM; do_color_align = 0; - if (filp || (flags & MAP_SHARED)) + if ((filp || (flags & MAP_SHARED)) && !file_hugepage) do_color_align = 1; if (addr) { @@ -129,8 +144,9 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi info.length = len; info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = min(task_size, VA_EXCLUDE_START); - info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!file_hugepage) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) { @@ -154,15 +170,19 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, unsigned long addr = addr0; int do_color_align; struct vm_unmapped_area_info info = {}; + bool file_hugepage = false; /* This should only ever run for 32-bit processes. */ BUG_ON(!test_thread_flag(TIF_32BIT)); + if (filp && is_file_hugepages(filp)) + file_hugepage = true; + if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate * cache aliasing constraints. */ - if ((flags & MAP_SHARED) && + if (!file_hugepage && (flags & MAP_SHARED) && ((addr - (pgoff << PAGE_SHIFT)) & (SHMLBA - 1))) return -EINVAL; return addr; @@ -172,7 +192,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return -ENOMEM; do_color_align = 0; - if (filp || (flags & MAP_SHARED)) + if ((filp || (flags & MAP_SHARED)) && !file_hugepage) do_color_align = 1; /* requesting a specific address */ @@ -192,8 +212,9 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - info.align_mask = do_color_align ? (PAGE_MASK & (SHMLBA - 1)) : 0; - info.align_offset = pgoff << PAGE_SHIFT; + info.align_mask = get_align_mask(filp, flags); + if (!file_hugepage) + info.align_offset = pgoff << PAGE_SHIFT; addr = vm_unmapped_area(&info); /* From 5959ffabbb67dfdd0cd4623e675a24005de66393 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:33 +0200 Subject: [PATCH 057/215] arch/powerpc: teach book3s64 arch_get_unmapped_area{_topdown} to handle hugetlb mappings We want to stop special casing hugetlb mappings and make them go through generic channels, so teach arch_get_unmapped_area{_topdown} to handle those. Reshuffle file_to_psize() definition so arch_get_unmapped_area{_topdown} can make use of it. 
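A compact sketch of the page-size selection both powerpc entry points now share (illustration only; the helper and the shift value stand in for what file_to_psize()/huge_page_shift() and mm_ctx_user_psize() would yield):

#include <stdio.h>

/* assumed stand-ins for the MMU page-size indices used by the slice code */
enum psize { MMU_PAGE_4K, MMU_PAGE_64K, MMU_PAGE_16M };

static enum psize file_to_psize_sketch(unsigned int huge_page_shift)
{
	return huge_page_shift == 24 ? MMU_PAGE_16M : MMU_PAGE_64K;
}

static enum psize pick_psize(int is_hugetlb_file, unsigned int huge_shift,
			     enum psize mm_default_psize)
{
	/* hugetlb files use the hstate's page size, everything else the mm default */
	return is_hugetlb_file ? file_to_psize_sketch(huge_shift) : mm_default_psize;
}

int main(void)
{
	printf("%d\n", pick_psize(1, 24, MMU_PAGE_64K));	/* 2 == MMU_PAGE_16M */
	printf("%d\n", pick_psize(0, 0, MMU_PAGE_64K));		/* 1 == MMU_PAGE_64K */
	return 0;
}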
Link: https://lkml.kernel.org/r/20241007075037.267650-6-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/powerpc/mm/book3s64/slice.c | 40 ++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c index 87307d0fc3b81..3a858f6b72701 100644 --- a/arch/powerpc/mm/book3s64/slice.c +++ b/arch/powerpc/mm/book3s64/slice.c @@ -633,6 +633,20 @@ unsigned long slice_get_unmapped_area(unsigned long addr, unsigned long len, } EXPORT_SYMBOL_GPL(slice_get_unmapped_area); +#ifdef CONFIG_HUGETLB_PAGE +static int file_to_psize(struct file *file) +{ + struct hstate *hstate = hstate_file(file); + + return shift_to_mmu_psize(huge_page_shift(hstate)); +} +#else +static int file_to_psize(struct file *file) +{ + return 0; +} +#endif + unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, @@ -640,11 +654,17 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long flags, vm_flags_t vm_flags) { + unsigned int psize; + if (radix_enabled()) return generic_get_unmapped_area(filp, addr, len, pgoff, flags, vm_flags); - return slice_get_unmapped_area(addr, len, flags, - mm_ctx_user_psize(¤t->mm->context), 0); + if (filp && is_file_hugepages(filp)) + psize = file_to_psize(filp); + else + psize = mm_ctx_user_psize(¤t->mm->context); + + return slice_get_unmapped_area(addr, len, flags, psize, 0); } unsigned long arch_get_unmapped_area_topdown(struct file *filp, @@ -654,11 +674,17 @@ unsigned long arch_get_unmapped_area_topdown(struct file *filp, const unsigned long flags, vm_flags_t vm_flags) { + unsigned int psize; + if (radix_enabled()) return generic_get_unmapped_area_topdown(filp, addr0, len, pgoff, flags, vm_flags); - return slice_get_unmapped_area(addr0, len, flags, - mm_ctx_user_psize(¤t->mm->context), 1); + if (filp && is_file_hugepages(filp)) + psize = file_to_psize(filp); + else + psize = mm_ctx_user_psize(¤t->mm->context); + + return slice_get_unmapped_area(addr0, len, flags, psize, 1); } unsigned int notrace get_slice_psize(struct mm_struct *mm, unsigned long addr) @@ -789,12 +815,6 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); } -static int file_to_psize(struct file *file) -{ - struct hstate *hstate = hstate_file(file); - return shift_to_mmu_psize(huge_page_shift(hstate)); -} - unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) From 7bd3f1e1a9ae7f7508c88dd056755a0a4741ae88 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:34 +0200 Subject: [PATCH 058/215] mm: make hugetlb mappings go through mm_get_unmapped_area_vmflags Hugetlb mappings will no longer be special cased but rather go through the generic mm_get_unmapped_area_vmflags function. For that to happen, let us remove the .get_unmapped_area from hugetlbfs_file_operations struct, and hint __get_unmapped_area that it should not send hugetlb mappings through thp_get_unmapped_area_vmflags but through mm_get_unmapped_area_vmflags. Create also a function called hugetlb_mmap_check_and_align() where a couple of safety checks are being done and the addr is aligned to the huge page size. 
Otherwise we will have to do this in every single function, which duplicates quite a lot of code. Link: https://lkml.kernel.org/r/20241007075037.267650-7-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 24 ++++++++++++++++-------- include/linux/hugetlb.h | 9 ++++----- 2 files changed, 20 insertions(+), 13 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 5cf327337e227..2c5f34e315d25 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -258,15 +258,23 @@ generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, pgoff, flags); } -#ifndef HAVE_ARCH_HUGETLB_UNMAPPED_AREA -static unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) +unsigned long +__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long flags) { - return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); + unsigned long addr0 = 0; + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) + return -EINVAL; + if ((flags & MAP_FIXED) && prepare_hugepage_range(file, addr, len)) + return -EINVAL; + if (addr) + addr0 = ALIGN(addr, huge_page_size(h)); + + return mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff, + flags, 0); } -#endif /* * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. @@ -1300,7 +1308,7 @@ static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, - .get_unmapped_area = hugetlb_get_unmapped_area, + .get_unmapped_area = __hugetlb_get_unmapped_area, .llseek = default_llseek, .fallocate = hugetlbfs_fallocate, .fop_flags = FOP_HUGE_PAGES, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 368d552e4860c..3a81b6126f623 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -546,11 +546,10 @@ static inline struct hstate *hstate_inode(struct inode *i) } #endif /* !CONFIG_HUGETLBFS */ -#ifdef HAVE_ARCH_HUGETLB_UNMAPPED_AREA -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); -#endif /* HAVE_ARCH_HUGETLB_UNMAPPED_AREA */ +unsigned long +__generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags); unsigned long generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, From cc92882ee218d62ef017fa545b3c8a2d1e060a5a Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:35 +0200 Subject: [PATCH 059/215] mm: drop hugetlb_get_unmapped_area{_*} functions Hugetlb mappings are now handled through normal channels just like any other mapping, so we no longer need hugetlb_get_unmapped_area* specific functions. 
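For orientation, the path a hugetlbfs mmap() now takes is roughly the following (call chain reconstructed from the series as a whole, so treat it as a sketch rather than a quote of the code):

	mmap()
	  __get_unmapped_area()                      /* dispatches via file->f_op->get_unmapped_area */
	    hugetlb_get_unmapped_area()              /* fs/hugetlbfs: size check, huge-page align addr */
	      mm_get_unmapped_area_vmflags()
	        {generic,arch}_get_unmapped_area{_topdown}()   /* now hugetlb aware */
	          vm_unmapped_area()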
Link: https://lkml.kernel.org/r/20241007075037.267650-8-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/parisc/mm/hugetlbpage.c | 21 ------ arch/powerpc/mm/book3s64/slice.c | 10 --- arch/s390/mm/hugetlbpage.c | 85 ------------------------ arch/sparc/mm/hugetlbpage.c | 108 ------------------------------- arch/x86/mm/hugetlbpage.c | 101 ----------------------------- fs/hugetlbfs/inode.c | 96 ++------------------------- include/linux/hugetlb.h | 7 +- 7 files changed, 6 insertions(+), 422 deletions(-) diff --git a/arch/parisc/mm/hugetlbpage.c b/arch/parisc/mm/hugetlbpage.c index aa664f7ddb639..e9d18cf25b792 100644 --- a/arch/parisc/mm/hugetlbpage.c +++ b/arch/parisc/mm/hugetlbpage.c @@ -21,27 +21,6 @@ #include -unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > TASK_SIZE) - return -ENOMEM; - - if (flags & MAP_FIXED) - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - - if (addr) - addr = ALIGN(addr, huge_page_size(h)); - - /* we need to make sure the colouring is OK */ - return arch_get_unmapped_area(file, addr, len, pgoff, flags, 0); -} pte_t *huge_pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/book3s64/slice.c b/arch/powerpc/mm/book3s64/slice.c index 3a858f6b72701..bc9a39821d1c6 100644 --- a/arch/powerpc/mm/book3s64/slice.c +++ b/arch/powerpc/mm/book3s64/slice.c @@ -814,14 +814,4 @@ unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) return 1UL << mmu_psize_to_shift(get_slice_psize(vma->vm_mm, vma->vm_start)); } - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - if (radix_enabled()) - return generic_hugetlb_get_unmapped_area(file, addr, len, pgoff, flags); - - return slice_get_unmapped_area(addr, len, flags, file_to_psize(file), 1); -} #endif diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index ded0eff58a192..7c79cf1bc7d76 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -242,88 +242,3 @@ bool __init arch_hugetlb_valid_size(unsigned long size) else return false; } - -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.length = len; - info.low_limit = current->mm->mmap_base; - info.high_limit = TASK_SIZE; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - return vm_unmapped_area(&info); -} - -static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr0, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - unsigned long addr; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = current->mm->mmap_base; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. 
This scenario - * can happen with large stack limits and large mmap() - * allocations. - */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > TASK_SIZE - mmap_min_addr) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - goto check_asce_limit; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma))) - goto check_asce_limit; - } - - if (!test_bit(MMF_TOPDOWN, &mm->flags)) - addr = hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - addr = hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); - if (offset_in_page(addr)) - return addr; - -check_asce_limit: - return check_asce_limit(mm, addr, len); -} diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index cc91ca7a1e182..eee601a0d2cfb 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -19,114 +19,6 @@ #include #include -/* Slightly simplified from the non-hugepage variant because by - * definition we don't have to worry about any page coloring stuff - */ - -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *filp, - unsigned long addr, - unsigned long len, - unsigned long pgoff, - unsigned long flags) -{ - struct hstate *h = hstate_file(filp); - unsigned long task_size = TASK_SIZE; - struct vm_unmapped_area_info info = {}; - - if (test_thread_flag(TIF_32BIT)) - task_size = STACK_TOP32; - - info.length = len; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = min(task_size, VA_EXCLUDE_START); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - if ((addr & ~PAGE_MASK) && task_size > VA_EXCLUDE_END) { - VM_BUG_ON(addr != -ENOMEM); - info.low_limit = VA_EXCLUDE_END; - info.high_limit = task_size; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -static unsigned long -hugetlb_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, - const unsigned long len, - const unsigned long pgoff, - const unsigned long flags) -{ - struct hstate *h = hstate_file(filp); - struct mm_struct *mm = current->mm; - unsigned long addr = addr0; - struct vm_unmapped_area_info info = {}; - - /* This should only ever run for 32-bit processes. */ - BUG_ON(!test_thread_flag(TIF_32BIT)); - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = mm->mmap_base; - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. 
- */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = STACK_TOP32; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - unsigned long task_size = TASK_SIZE; - - if (test_thread_flag(TIF_32BIT)) - task_size = STACK_TOP32; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > task_size) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma(mm, addr); - if (task_size - len >= addr && - (!vma || addr + len <= vm_start_gap(vma))) - return addr; - } - if (!test_bit(MMF_TOPDOWN, &mm->flags)) - return hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - return hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); -} static pte_t sun4u_hugepage_shift_to_tte(pte_t entry, unsigned int shift) { diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 807a5859a3c4b..58f7f2bd535d5 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -19,107 +19,6 @@ #include #include -#ifdef CONFIG_HUGETLB_PAGE -static unsigned long hugetlb_get_unmapped_area_bottomup(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.length = len; - info.low_limit = get_mmap_base(1); - - /* - * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area - * in the full address space. - */ - info.high_limit = in_32bit_syscall() ? - task_size_32bit() : task_size_64bit(addr > DEFAULT_MAP_WINDOW); - - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - return vm_unmapped_area(&info); -} - -static unsigned long hugetlb_get_unmapped_area_topdown(struct file *file, - unsigned long addr, unsigned long len, - unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = get_mmap_base(0); - - /* - * If hint address is above DEFAULT_MAP_WINDOW, look for unmapped area - * in the full address space. - */ - if (addr > DEFAULT_MAP_WINDOW && !in_32bit_syscall()) - info.high_limit += TASK_SIZE_MAX - DEFAULT_MAP_WINDOW; - - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. 
- */ - if (addr & ~PAGE_MASK) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = TASK_UNMAPPED_BASE; - info.high_limit = TASK_SIZE_LOW; - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long -hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; - - if (len & ~huge_page_mask(h)) - return -EINVAL; - - if (len > TASK_SIZE) - return -ENOMEM; - - /* No address checking. See comment at mmap_address_hint_valid() */ - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr &= huge_page_mask(h); - if (!mmap_address_hint_valid(addr, len)) - goto get_unmapped_area; - - vma = find_vma(mm, addr); - if (!vma || addr + len <= vm_start_gap(vma)) - return addr; - } - -get_unmapped_area: - if (!test_bit(MMF_TOPDOWN, &mm->flags)) - return hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); - else - return hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); -} -#endif /* CONFIG_HUGETLB_PAGE */ #ifdef CONFIG_X86_64 bool __init arch_hugetlb_valid_size(unsigned long size) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2c5f34e315d25..935c0ed3aa1ea 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -171,96 +171,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * Called under mmap_write_lock(mm). */ -static unsigned long -hugetlb_get_unmapped_area_bottomup(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.length = len; - info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr, len, flags); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - return vm_unmapped_area(&info); -} - -static unsigned long -hugetlb_get_unmapped_area_topdown(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, unsigned long flags) -{ - struct hstate *h = hstate_file(file); - struct vm_unmapped_area_info info = {}; - - info.flags = VM_UNMAPPED_AREA_TOPDOWN; - info.length = len; - info.low_limit = PAGE_SIZE; - info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base); - info.align_mask = PAGE_MASK & ~huge_page_mask(h); - addr = vm_unmapped_area(&info); - - /* - * A failed mmap() very likely causes application failure, - * so fall back to the bottom-up function here. This scenario - * can happen with large stack limits and large mmap() - * allocations. 
- */ - if (unlikely(offset_in_page(addr))) { - VM_BUG_ON(addr != -ENOMEM); - info.flags = 0; - info.low_limit = current->mm->mmap_base; - info.high_limit = arch_get_mmap_end(addr, len, flags); - addr = vm_unmapped_area(&info); - } - - return addr; -} - -unsigned long -generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; - struct hstate *h = hstate_file(file); - const unsigned long mmap_end = arch_get_mmap_end(addr, len, flags); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (len > mmap_end - mmap_min_addr) - return -ENOMEM; - - if (flags & MAP_FIXED) { - if (prepare_hugepage_range(file, addr, len)) - return -EINVAL; - return addr; - } - - if (addr) { - addr = ALIGN(addr, huge_page_size(h)); - vma = find_vma_prev(mm, addr, &prev); - if (mmap_end - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vm_start_gap(vma)) && - (!prev || addr >= vm_end_gap(prev))) - return addr; - } - - /* - * Use MMF_TOPDOWN flag as a hint to use topdown routine. - * If architectures have special needs, they should define their own - * version of hugetlb_get_unmapped_area. - */ - if (test_bit(MMF_TOPDOWN, &mm->flags)) - return hugetlb_get_unmapped_area_topdown(file, addr, len, - pgoff, flags); - return hugetlb_get_unmapped_area_bottomup(file, addr, len, - pgoff, flags); -} - unsigned long -__hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long flags) +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, + unsigned long len, unsigned long pgoff, + unsigned long flags) { unsigned long addr0 = 0; struct hstate *h = hstate_file(file); @@ -272,7 +186,7 @@ __hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) addr0 = ALIGN(addr, huge_page_size(h)); - return mm_get_unmapped_area_vmflags(current->mm, file, addr, len, pgoff, + return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, flags, 0); } @@ -1308,7 +1222,7 @@ static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, .mmap = hugetlbfs_file_mmap, .fsync = noop_fsync, - .get_unmapped_area = __hugetlb_get_unmapped_area, + .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, .fallocate = hugetlbfs_fallocate, .fop_flags = FOP_HUGE_PAGES, diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3a81b6126f623..ae4fe8615bb6e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -547,15 +547,10 @@ static inline struct hstate *hstate_inode(struct inode *i) #endif /* !CONFIG_HUGETLBFS */ unsigned long -__generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, +hugetlb_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags); -unsigned long -generic_hugetlb_get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags); - /* * huegtlb page specific state flags. These flags are located in page.private * of the hugetlb head page. Functions created via the below macros should be From 5b2f650d593ed4d020228df8563e7ad23abc847f Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:36 +0200 Subject: [PATCH 060/215] arch/s390: clean up hugetlb definitions s390 redefines functions that are already defined (and the same) in include/asm-generic/hugetlb.h. 
Do as the other architectures: 1) include include/asm-generic/hugetlb.h 2) drop the already defined functions in the generic hugetlb.h and 3) use the __HAVE_ARCH_HUGE_* macros to define our own. This gets rid of quite some code. Link: https://lkml.kernel.org/r/20241007075037.267650-9-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/s390/include/asm/hugetlb.h | 58 +++++++++------------------------ include/asm-generic/hugetlb.h | 8 +++++ 2 files changed, 24 insertions(+), 42 deletions(-) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index cf1b5d6fb1a62..37e80a32623a7 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -12,21 +12,24 @@ #include #include -#define hugetlb_free_pgd_range free_pgd_range #define hugepages_supported() (MACHINE_HAS_EDAT1) +#define __HAVE_ARCH_HUGE_SET_HUGE_PTE_AT void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte, unsigned long sz); void __set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); -pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -pte_t huge_ptep_get_and_clear(struct mm_struct *mm, +#define __HAVE_ARCH_HUGE_PTEP_GET +extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep); +#define __HAVE_ARCH_HUGE_PTEP_GET_AND_CLEAR +extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); /* * If the arch doesn't supply something else, assume that hugepage * size aligned regions are ok without further preparation. */ +#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { @@ -45,6 +48,7 @@ static inline void arch_clear_hugetlb_flags(struct folio *folio) } #define arch_clear_hugetlb_flags arch_clear_hugetlb_flags +#define __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep, unsigned long sz) { @@ -54,12 +58,14 @@ static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, set_pte(ptep, __pte(_SEGMENT_ENTRY_EMPTY)); } +#define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { return huge_ptep_get_and_clear(vma->vm_mm, address, ptep); } +#define __HAVE_ARCH_HUGE_PTEP_SET_ACCESS_FLAGS static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t pte, int dirty) @@ -72,6 +78,7 @@ static inline int huge_ptep_set_access_flags(struct vm_area_struct *vma, return changed; } +#define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { @@ -79,69 +86,36 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, __set_huge_pte_at(mm, addr, ptep, pte_wrprotect(pte)); } -static inline pte_t mk_huge_pte(struct page *page, pgprot_t pgprot) -{ - return mk_pte(page, pgprot); -} - +#define __HAVE_ARCH_HUGE_PTE_NONE static inline int huge_pte_none(pte_t pte) { return pte_none(pte); } +#define __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY static inline int huge_pte_none_mostly(pte_t pte) { return huge_pte_none(pte); } -static inline int huge_pte_write(pte_t pte) -{ - return pte_write(pte); -} - -static inline int huge_pte_dirty(pte_t 
pte) -{ - return pte_dirty(pte); -} - -static inline pte_t huge_pte_mkwrite(pte_t pte) -{ - return pte_mkwrite_novma(pte); -} - -static inline pte_t huge_pte_mkdirty(pte_t pte) -{ - return pte_mkdirty(pte); -} - -static inline pte_t huge_pte_wrprotect(pte_t pte) -{ - return pte_wrprotect(pte); -} - -static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) -{ - return pte_modify(pte, newprot); -} - +#define __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return pte; } +#define __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte; } +#define __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return 0; } -static inline bool gigantic_page_runtime_supported(void) -{ - return true; -} +#include #endif /* _ASM_S390_HUGETLB_H */ diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 594d5905f6151..67bbdafcfc224 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -42,20 +42,26 @@ static inline pte_t huge_pte_modify(pte_t pte, pgprot_t newprot) return pte_modify(pte, newprot); } +#ifndef __HAVE_ARCH_HUGE_PTE_MKUFFD_WP static inline pte_t huge_pte_mkuffd_wp(pte_t pte) { return huge_pte_wrprotect(pte_mkuffd_wp(pte)); } +#endif +#ifndef __HAVE_ARCH_HUGE_PTE_CLEAR_UFFD_WP static inline pte_t huge_pte_clear_uffd_wp(pte_t pte) { return pte_clear_uffd_wp(pte); } +#endif +#ifndef __HAVE_ARCH_HUGE_PTE_UFFD_WP static inline int huge_pte_uffd_wp(pte_t pte) { return pte_uffd_wp(pte); } +#endif #ifndef __HAVE_ARCH_HUGE_PTE_CLEAR static inline void huge_pte_clear(struct mm_struct *mm, unsigned long addr, @@ -106,10 +112,12 @@ static inline int huge_pte_none(pte_t pte) #endif /* Please refer to comments above pte_none_mostly() for the usage */ +#ifndef __HAVE_ARCH_HUGE_PTE_NONE_MOSTLY static inline int huge_pte_none_mostly(pte_t pte) { return huge_pte_none(pte) || is_pte_marker(pte); } +#endif #ifndef __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE static inline int prepare_hugepage_range(struct file *file, From bd40b053fabe27209cb240d205a0c817cbe5fb87 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Mon, 7 Oct 2024 09:50:37 +0200 Subject: [PATCH 061/215] mm: consolidate common checks in hugetlb_get_unmapped_area prepare_hugepage_range() performs almost the same checks for all architectures that define it, with the exception of mips and loongarch that also check for overflows. The rest checks for the addr and len to be properly aligned, so we can move that to hugetlb_get_unmapped_area() and get rid of a fair amount of duplicated code. 
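The checks that move into hugetlb_get_unmapped_area() boil down to two mask tests. A standalone sketch, with a 2 MiB huge page size assumed and -22 standing in for -EINVAL:

#include <stdio.h>

#define HPAGE_SIZE	(2UL << 20)
#define HPAGE_MASK	(~(HPAGE_SIZE - 1))

/* 0 on success, an -EINVAL-style failure otherwise (illustration only) */
static int hugetlb_mmap_checks(unsigned long addr, unsigned long len, int map_fixed)
{
	if (len & ~HPAGE_MASK)
		return -22;	/* length must be a multiple of the huge page size */
	if (map_fixed && (addr & ~HPAGE_MASK))
		return -22;	/* a MAP_FIXED address must be huge-page aligned */
	return 0;
}

int main(void)
{
	printf("%d\n", hugetlb_mmap_checks(0x40000000, 2 * HPAGE_SIZE, 1));	/* 0 */
	printf("%d\n", hugetlb_mmap_checks(0x40001000, 2 * HPAGE_SIZE, 1));	/* -22 */
	printf("%d\n", hugetlb_mmap_checks(0, HPAGE_SIZE + 4096, 0));		/* -22 */
	return 0;
}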
[akpm@linux-foundation.org: remove now-unused local] Link: https://lore.kernel.org/oe-kbuild-all/202410081210.uNLbf3Jk-lkp@intel.com/ Link: https://lkml.kernel.org/r/20241007075037.267650-10-osalvador@suse.de Signed-off-by: Oscar Salvador Cc: David Hildenbrand Cc: Donet Tom Cc: Lorenzo Stoakes Cc: Michal Hocko Cc: Muchun Song Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/hugetlb.h | 5 ----- arch/mips/include/asm/hugetlb.h | 5 ----- arch/parisc/include/asm/hugetlb.h | 15 --------------- arch/s390/include/asm/hugetlb.h | 17 ----------------- arch/sh/include/asm/hugetlb.h | 15 --------------- fs/hugetlbfs/inode.c | 8 ++++++-- include/asm-generic/hugetlb.h | 7 ------- 7 files changed, 6 insertions(+), 66 deletions(-) diff --git a/arch/loongarch/include/asm/hugetlb.h b/arch/loongarch/include/asm/hugetlb.h index 5da32c00d483f..b837c65a4894e 100644 --- a/arch/loongarch/include/asm/hugetlb.h +++ b/arch/loongarch/include/asm/hugetlb.h @@ -16,12 +16,7 @@ static inline int prepare_hugepage_range(struct file *file, unsigned long len) { unsigned long task_size = STACK_TOP; - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; if (len > task_size) return -ENOMEM; if (task_size - len < addr) diff --git a/arch/mips/include/asm/hugetlb.h b/arch/mips/include/asm/hugetlb.h index fd69c88085542..d0a86ce83de91 100644 --- a/arch/mips/include/asm/hugetlb.h +++ b/arch/mips/include/asm/hugetlb.h @@ -17,12 +17,7 @@ static inline int prepare_hugepage_range(struct file *file, unsigned long len) { unsigned long task_size = STACK_TOP; - struct hstate *h = hstate_file(file); - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; if (len > task_size) return -ENOMEM; if (task_size - len < addr) diff --git a/arch/parisc/include/asm/hugetlb.h b/arch/parisc/include/asm/hugetlb.h index 72daacc472a0a..5b3a5429f71b3 100644 --- a/arch/parisc/include/asm/hugetlb.h +++ b/arch/parisc/include/asm/hugetlb.h @@ -12,21 +12,6 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) diff --git a/arch/s390/include/asm/hugetlb.h b/arch/s390/include/asm/hugetlb.h index 37e80a32623a7..6f815d4ba0cab 100644 --- a/arch/s390/include/asm/hugetlb.h +++ b/arch/s390/include/asm/hugetlb.h @@ -25,23 +25,6 @@ extern pte_t huge_ptep_get(struct mm_struct *mm, unsigned long addr, pte_t *ptep extern pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. 
- */ -#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - struct hstate *h = hstate_file(file); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; -} - static inline void arch_clear_hugetlb_flags(struct folio *folio) { clear_bit(PG_arch_1, &folio->flags); diff --git a/arch/sh/include/asm/hugetlb.h b/arch/sh/include/asm/hugetlb.h index 75028bd568ba5..4a92e6e4d627e 100644 --- a/arch/sh/include/asm/hugetlb.h +++ b/arch/sh/include/asm/hugetlb.h @@ -5,21 +5,6 @@ #include #include -/* - * If the arch doesn't supply something else, assume that hugepage - * size aligned regions are ok without further preparation. - */ -#define __HAVE_ARCH_PREPARE_HUGEPAGE_RANGE -static inline int prepare_hugepage_range(struct file *file, - unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #define __HAVE_ARCH_HUGE_PTEP_CLEAR_FLUSH static inline pte_t huge_ptep_clear_flush(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 935c0ed3aa1ea..c6191a6118b83 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -181,8 +181,12 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (len & ~huge_page_mask(h)) return -EINVAL; - if ((flags & MAP_FIXED) && prepare_hugepage_range(file, addr, len)) - return -EINVAL; + if (flags & MAP_FIXED) { + if (addr & ~huge_page_mask(h)) + return -EINVAL; + if (prepare_hugepage_range(file, addr, len)) + return -EINVAL; + } if (addr) addr0 = ALIGN(addr, huge_page_size(h)); diff --git a/include/asm-generic/hugetlb.h b/include/asm-generic/hugetlb.h index 67bbdafcfc224..f42133dae68e5 100644 --- a/include/asm-generic/hugetlb.h +++ b/include/asm-generic/hugetlb.h @@ -123,13 +123,6 @@ static inline int huge_pte_none_mostly(pte_t pte) static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { - struct hstate *h = hstate_file(file); - - if (len & ~huge_page_mask(h)) - return -EINVAL; - if (addr & ~huge_page_mask(h)) - return -EINVAL; - return 0; } #endif From 018d24539d9ed7531245a381ba24f5d9e8714682 Mon Sep 17 00:00:00 2001 From: Dennis Zhou Date: Mon, 7 Oct 2024 17:19:42 -0700 Subject: [PATCH 062/215] percpu: fix data race with pcpu_nr_empty_pop_pages Fixes the data race by moving the read to be behind the pcpu_lock. This is okay because the code (initially) above it will not increase the empty populated page count because it is populating backing pages that already have allocations served out of them. 
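The fix follows a standard pattern: sample the shared counter and kick the balance work while still holding pcpu_lock, instead of re-reading it after the lock has been dropped. A minimal userspace sketch of the pattern (a pthread mutex stands in for the spinlock; names and the threshold are illustrative):

#include <pthread.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int nr_empty_pop_pages;			/* shared state, guarded by "lock" */
#define EMPTY_POP_PAGES_LOW	2

static void schedule_balance_work(void)
{
	/* in the kernel this queues balance work; a no-op is enough for the sketch */
}

static void alloc_path(void)
{
	pthread_mutex_lock(&lock);
	/* ... area_found: the allocation has been carved out of a chunk ... */
	if (nr_empty_pop_pages < EMPTY_POP_PAGES_LOW)	/* read now happens under the lock */
		schedule_balance_work();
	pthread_mutex_unlock(&lock);

	/* the old code re-read nr_empty_pop_pages down here, racing with concurrent updates */
}

int main(void)
{
	alloc_path();
	return 0;
}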
Link: https://lkml.kernel.org/r/20241008001942.8114-1-dennis@kernel.org Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202407191651.f24e499d-oliver.sang@intel.com Signed-off-by: Dennis Zhou Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton --- mm/percpu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index da21680ff294c..d1a73cf65c532 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1864,6 +1864,10 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, area_found: pcpu_stats_area_alloc(chunk, size); + + if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) + pcpu_schedule_balance_work(); + spin_unlock_irqrestore(&pcpu_lock, flags); /* populate if not all pages are already there */ @@ -1891,9 +1895,6 @@ void __percpu *pcpu_alloc_noprof(size_t size, size_t align, bool reserved, mutex_unlock(&pcpu_alloc_mutex); } - if (pcpu_nr_empty_pop_pages < PCPU_EMPTY_POP_PAGES_LOW) - pcpu_schedule_balance_work(); - /* clear the areas and return address relative to base address */ for_each_possible_cpu(cpu) memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); From 077c7c1e099f46b4abd0babf233d476597a4823a Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 7 Oct 2024 18:20:09 -0700 Subject: [PATCH 063/215] mm/memory.c: remove stray newline at top of file Fixes: d61ea1cb0095 ("userfaultfd: UFFD_FEATURE_WP_ASYNC") Reported-by: Jeongjun Park Closes: https://lkml.kernel.org/r/20241007065307.4158-1-aha310510@gmail.com Cc: Muhammad Usama Anjum Cc: Peter Xu Cc: Greg KH Signed-off-by: Andrew Morton --- mm/memory.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 6bda739a60e8b..5e9d6a22eb088 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1,4 +1,3 @@ - // SPDX-License-Identifier: GPL-2.0-only /* * linux/mm/memory.c From 150e0fb86d69caec7aec48705e563e9336b7a4a6 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Tue, 8 Oct 2024 14:23:53 +0100 Subject: [PATCH 064/215] MAINTAINERS: mailmap: update Alexey Klimov's email address My new address is alexey.klimov@linaro.org Link: https://lkml.kernel.org/r/20241008132353.68767-1-alexey.klimov@linaro.org Signed-off-by: Alexey Klimov Cc: Srinivas Kandagatla Signed-off-by: Andrew Morton --- .mailmap | 1 + MAINTAINERS | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.mailmap b/.mailmap index 5378f04b25668..b2cc9ee33caa1 100644 --- a/.mailmap +++ b/.mailmap @@ -37,6 +37,7 @@ Alexei Avshalom Lazar Alexei Starovoitov Alexei Starovoitov Alexei Starovoitov +Alexey Klimov Alexey Makhalov Alex Elder Alex Elder diff --git a/MAINTAINERS b/MAINTAINERS index bdae0faf000c7..27005b0fada97 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7855,7 +7855,7 @@ F: Documentation/gpu/automated_testing.rst F: drivers/gpu/drm/ci/ DSBR100 USB FM RADIO DRIVER -M: Alexey Klimov +M: Alexey Klimov L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git @@ -13568,7 +13568,7 @@ Q: http://patchwork.linuxtv.org/project/linux-media/list/ F: drivers/media/dvb-frontends/m88rs2000* MA901 MASTERKIT USB FM RADIO DRIVER -M: Alexey Klimov +M: Alexey Klimov L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git @@ -15704,7 +15704,7 @@ F: Documentation/hwmon/mp9941.rst F: drivers/hwmon/pmbus/mp9941.c MR800 AVERMEDIA USB FM RADIO DRIVER -M: Alexey Klimov +M: Alexey Klimov L: linux-media@vger.kernel.org S: Maintained T: git git://linuxtv.org/media_tree.git From 
ebcfc63d6bca3cce1bfca30092712f3468b4ecff Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 8 Oct 2024 11:47:45 +0530 Subject: [PATCH 065/215] mm: abstract THP allocation Patch series "Do not shatter hugezeropage on wp-fault", v7. It was observed at [1] and [2] that the current kernel behaviour of shattering a hugezeropage is inconsistent and suboptimal. For a VMA with a THP allowable order, when we write-fault on it, the kernel installs a PMD-mapped THP. On the other hand, if we first get a read fault, we get a PMD pointing to the hugezeropage; subsequent write will trigger a write-protection fault, shattering the hugezeropage into one writable page, and all the other PTEs write-protected. The conclusion being, as compared to the case of a single write-fault, applications have to suffer 512 extra page faults if they were to use the VMA as such, plus we get the overhead of khugepaged trying to replace that area with a THP anyway. Instead, replace the hugezeropage with a THP on wp-fault. [1]: https://lore.kernel.org/all/3743d7e1-0b79-4eaf-82d5-d1ca29fe347d@arm.com/ [2]: https://lore.kernel.org/all/1cfae0c0-96a2-4308-9c62-f7a640520242@arm.com/ This patch (of 2): In preparation for the second patch, abstract away the THP allocation logic present in the create_huge_pmd() path, which corresponds to the faulting case when no page is present. There should be no functional change as a result of applying this patch, except that, as David notes at [1], a PMD-aligned address should be passed to update_mmu_cache_pmd(). [1]: https://lore.kernel.org/all/ddd3fcd2-48b3-4170-bcaa-2fe66e093f43@redhat.com/ Link: https://lkml.kernel.org/r/20241008061746.285961-1-dev.jain@arm.com Link: https://lkml.kernel.org/r/20241008061746.285961-2-dev.jain@arm.com Signed-off-by: Dev Jain Acked-by: David Hildenbrand Reviewed-by: Kefeng Wang Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: Christoph Lameter Cc: Dave Hansen Cc: Hugh Dickins Cc: Jan Kara Cc: Kirill A. 
Shutemov Cc: Lance Yang Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 98 ++++++++++++++++++++++++++++-------------------- 1 file changed, 57 insertions(+), 41 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e71b58d84cba2..4cf29ebed92bf 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1136,47 +1136,81 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, - struct page *page, gfp_t gfp) +static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, + unsigned long addr) { - struct vm_area_struct *vma = vmf->vma; - struct folio *folio = page_folio(page); - pgtable_t pgtable; - unsigned long haddr = vmf->address & HPAGE_PMD_MASK; - vm_fault_t ret = 0; + gfp_t gfp = vma_thp_gfp_mask(vma); + const int order = HPAGE_PMD_ORDER; + struct folio *folio; - VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); + folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true); + if (unlikely(!folio)) { + count_vm_event(THP_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + return NULL; + } + + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { folio_put(folio); count_vm_event(THP_FAULT_FALLBACK); count_vm_event(THP_FAULT_FALLBACK_CHARGE); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); - return VM_FAULT_FALLBACK; + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK); + count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); + return NULL; } folio_throttle_swaprate(folio, gfp); - pgtable = pte_alloc_one(vma->vm_mm); - if (unlikely(!pgtable)) { - ret = VM_FAULT_OOM; - goto release; - } - - folio_zero_user(folio, vmf->address); + folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that * folio_zero_user writes become visible before the set_pmd_at() * write. 
*/ __folio_mark_uptodate(folio); + return folio; +} + +static void map_anon_folio_pmd(struct folio *folio, pmd_t *pmd, + struct vm_area_struct *vma, unsigned long haddr) +{ + pmd_t entry; + + entry = mk_huge_pmd(&folio->page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); + folio_add_lru_vma(folio, vma); + set_pmd_at(vma->vm_mm, haddr, pmd, entry); + update_mmu_cache_pmd(vma, haddr, pmd); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + count_vm_event(THP_FAULT_ALLOC); + count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); + count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); +} + +static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf) +{ + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct folio *folio; + pgtable_t pgtable; + vm_fault_t ret = 0; + + folio = vma_alloc_anon_folio_pmd(vma, vmf->address); + if (unlikely(!folio)) + return VM_FAULT_FALLBACK; + + pgtable = pte_alloc_one(vma->vm_mm); + if (unlikely(!pgtable)) { + ret = VM_FAULT_OOM; + goto release; + } vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); if (unlikely(!pmd_none(*vmf->pmd))) { goto unlock_release; } else { - pmd_t entry; - ret = check_stable_address_space(vma->vm_mm); if (ret) goto unlock_release; @@ -1190,21 +1224,11 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf, VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } - - entry = mk_huge_pmd(page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - folio_add_new_anon_rmap(folio, vma, haddr, RMAP_EXCLUSIVE); - folio_add_lru_vma(folio, vma); pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); - add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); mm_inc_nr_ptes(vma->vm_mm); deferred_split_folio(folio, false); spin_unlock(vmf->ptl); - count_vm_event(THP_FAULT_ALLOC); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_ALLOC); - count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC); } return 0; @@ -1271,8 +1295,6 @@ static void set_huge_zero_folio(pgtable_t pgtable, struct mm_struct *mm, vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - gfp_t gfp; - struct folio *folio; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; vm_fault_t ret; @@ -1323,14 +1345,8 @@ vm_fault_t do_huge_pmd_anonymous_page(struct vm_fault *vmf) } return ret; } - gfp = vma_thp_gfp_mask(vma); - folio = vma_alloc_folio(gfp, HPAGE_PMD_ORDER, vma, haddr, true); - if (unlikely(!folio)) { - count_vm_event(THP_FAULT_FALLBACK); - count_mthp_stat(HPAGE_PMD_ORDER, MTHP_STAT_ANON_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } - return __do_huge_pmd_anonymous_page(vmf, &folio->page, gfp); + + return __do_huge_pmd_anonymous_page(vmf); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, From 1ced09e0331f6cc4ca7eae75bc0ef03957129a94 Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Tue, 8 Oct 2024 11:47:46 +0530 Subject: [PATCH 066/215] mm: allocate THP on hugezeropage wp-fault Introduce do_huge_zero_wp_pmd() to handle wp-fault on a hugezeropage and replace it with a PMD-mapped THP. Remember to flush TLB entry corresponding to the hugezeropage. In case of failure, fallback to splitting the PMD. 
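Condensed from the diff below, the replacement runs under the PMD lock inside an MMU_NOTIFY_CLEAR range, re-checks the PMD, and flushes the zero-page entry before installing the THP. A sketch only, with error paths omitted:

	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm,
				haddr, haddr + HPAGE_PMD_SIZE);
	mmu_notifier_invalidate_range_start(&range);
	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
	if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd)))
		goto release;		/* raced with another fault */
	(void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd);	/* flush the hugezeropage TLB entry */
	map_anon_folio_pmd(folio, vmf->pmd, vma, haddr);	/* install the PMD-mapped THP */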
Link: https://lkml.kernel.org/r/20241008061746.285961-3-dev.jain@arm.com Signed-off-by: Dev Jain Acked-by: David Hildenbrand Reviewed-by: Kefeng Wang Cc: Alistair Popple Cc: Aneesh Kumar K.V Cc: Anshuman Khandual Cc: Barry Song Cc: Catalin Marinas Cc: Christoph Lameter Cc: Dave Hansen Cc: Dev Jain Cc: Hugh Dickins Cc: Jan Kara Cc: Kirill A. Shutemov Cc: Lance Yang Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michal Hocko Cc: Peter Xu Cc: Ryan Roberts Cc: Vlastimil Babka Cc: Will Deacon Cc: Yang Shi Cc: Zi Yan Signed-off-by: Andrew Morton --- mm/huge_memory.c | 41 ++++++++++++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4cf29ebed92bf..c674afd16245c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1778,6 +1778,38 @@ void huge_pmd_set_accessed(struct vm_fault *vmf) spin_unlock(vmf->ptl); } +static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf) +{ + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + struct mmu_notifier_range range; + struct folio *folio; + vm_fault_t ret = 0; + + folio = vma_alloc_anon_folio_pmd(vma, vmf->address); + if (unlikely(!folio)) + return VM_FAULT_FALLBACK; + + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, haddr, + haddr + HPAGE_PMD_SIZE); + mmu_notifier_invalidate_range_start(&range); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(pmdp_get(vmf->pmd), vmf->orig_pmd))) + goto release; + ret = check_stable_address_space(vma->vm_mm); + if (ret) + goto release; + (void)pmdp_huge_clear_flush(vma, haddr, vmf->pmd); + map_anon_folio_pmd(folio, vmf->pmd, vma, haddr); + goto unlock; +release: + folio_put(folio); +unlock: + spin_unlock(vmf->ptl); + mmu_notifier_invalidate_range_end(&range); + return ret; +} + vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; @@ -1790,8 +1822,15 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); - if (is_huge_zero_pmd(orig_pmd)) + if (is_huge_zero_pmd(orig_pmd)) { + vm_fault_t ret = do_huge_zero_wp_pmd(vmf); + + if (!(ret & VM_FAULT_FALLBACK)) + return ret; + + /* Fallback to splitting PMD if THP cannot be allocated */ goto fallback; + } spin_lock(vmf->ptl); From 01a9097aa3ce4c6aef296779c163169ac403260e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 9 Oct 2024 13:28:00 +0900 Subject: [PATCH 067/215] zram: do not open-code comp priority 0 A cosmetic change: do not open-code compression priority 0, use ZRAM_PRIMARY_COMP instead. 
Link: https://lkml.kernel.org/r/20241009042908.750260-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 263795c4aef70..e6d12e81241d8 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2285,7 +2285,7 @@ static void zram_destroy_comps(struct zram *zram) { u32 prio; - for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) { + for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { struct zcomp *comp = zram->comps[prio]; zram->comps[prio] = NULL; @@ -2357,7 +2357,7 @@ static ssize_t disksize_store(struct device *dev, goto out_unlock; } - for (prio = 0; prio < ZRAM_MAX_COMPS; prio++) { + for (prio = ZRAM_PRIMARY_COMP; prio < ZRAM_MAX_COMPS; prio++) { if (!zram->comp_algs[prio]) continue; From afe789b7367ad43ba8f079981d40851f8bd319ce Mon Sep 17 00:00:00 2001 From: John Hubbard Date: Tue, 8 Oct 2024 19:50:24 -0700 Subject: [PATCH 068/215] kaslr: rename physmem_end and PHYSMEM_END to direct_map_physmem_end For clarity. It's increasingly hard to reason about the code, when KASLR is moving around the boundaries. In this case where KASLR is randomizing the location of the kernel image within physical memory, the maximum number of address bits for physical memory has not changed. What has changed is the ending address of memory that is allowed to be directly mapped by the kernel. Let's name the variable, and the associated macro accordingly. Also, enhance the comment above the direct_map_physmem_end definition, to further clarify how this all works. Link: https://lkml.kernel.org/r/20241009025024.89813-1-jhubbard@nvidia.com Signed-off-by: John Hubbard Reviewed-by: Pankaj Gupta Acked-by: David Hildenbrand Acked-by: Will Deacon Reviewed-by: Mike Rapoport (Microsoft) Cc: Thomas Gleixner Cc: Alistair Popple Cc: Jordan Niethe Signed-off-by: Andrew Morton --- arch/arm64/include/asm/memory.h | 2 +- arch/x86/include/asm/page_64.h | 2 +- arch/x86/include/asm/pgtable_64_types.h | 2 +- arch/x86/mm/init_64.c | 2 +- arch/x86/mm/kaslr.c | 14 +++++++++----- include/linux/mm.h | 6 +++--- kernel/resource.c | 4 ++-- mm/memory_hotplug.c | 2 +- mm/sparse.c | 2 +- 9 files changed, 20 insertions(+), 16 deletions(-) diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index 0480c61dbb4f3..73eaa8c2536ae 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -110,7 +110,7 @@ #define PAGE_END (_PAGE_END(VA_BITS_MIN)) #endif /* CONFIG_KASAN */ -#define PHYSMEM_END __pa(PAGE_END - 1) +#define DIRECT_MAP_PHYSMEM_END __pa(PAGE_END - 1) #define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index f3d257c452254..d63576608ce76 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -17,7 +17,7 @@ extern unsigned long phys_base; extern unsigned long page_offset_base; extern unsigned long vmalloc_base; extern unsigned long vmemmap_base; -extern unsigned long physmem_end; +extern unsigned long direct_map_physmem_end; static __always_inline unsigned long __phys_addr_nodebug(unsigned long x) { diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index a98e53491a4e6..ec68f8369bdca 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -141,7 
+141,7 @@ extern unsigned int ptrs_per_p4d; #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ #ifdef CONFIG_RANDOMIZE_MEMORY -# define PHYSMEM_END physmem_end +# define DIRECT_MAP_PHYSMEM_END direct_map_physmem_end #endif /* diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index ff253648706fa..5a564130b9d0e 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -961,7 +961,7 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, unsigned long end = ((start_pfn + nr_pages) << PAGE_SHIFT) - 1; int ret; - if (WARN_ON_ONCE(end > PHYSMEM_END)) + if (WARN_ON_ONCE(end > DIRECT_MAP_PHYSMEM_END)) return -ERANGE; ret = __add_pages(nid, start_pfn, nr_pages, params); diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 230f1dee4f095..70d3353c92fac 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -52,7 +52,7 @@ static __initdata struct kaslr_memory_region { } kaslr_regions[] = { { .base = &page_offset_base, - .end = &physmem_end, + .end = &direct_map_physmem_end, }, { .base = &vmalloc_base, @@ -62,8 +62,12 @@ static __initdata struct kaslr_memory_region { }, }; -/* The end of the possible address space for physical memory */ -unsigned long physmem_end __ro_after_init; +/* + * The end of the physical address space that can be mapped directly by the + * kernel. This starts out at (1< __START_KERNEL_map); /* Preset the end of the possible address space for physical memory */ - physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1); + direct_map_physmem_end = ((1ULL << MAX_PHYSMEM_BITS) - 1); if (!kaslr_memory_enabled()) return; @@ -145,7 +149,7 @@ void __init kernel_randomize_memory(void) vaddr += get_padding(&kaslr_regions[i]); /* * KASLR trims the maximum possible size of the - * direct-map. Update the physmem_end boundary. + * direct-map. Update the direct_map_physmem_end boundary. * No rounding required as the region starts * PUD aligned and size is in units of TB. */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f5394d75ce23..4570f33e2429a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -97,11 +97,11 @@ extern const int mmap_rnd_compat_bits_max; extern int mmap_rnd_compat_bits __read_mostly; #endif -#ifndef PHYSMEM_END +#ifndef DIRECT_MAP_PHYSMEM_END # ifdef MAX_PHYSMEM_BITS -# define PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) +# define DIRECT_MAP_PHYSMEM_END ((1ULL << MAX_PHYSMEM_BITS) - 1) # else -# define PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) +# define DIRECT_MAP_PHYSMEM_END (((phys_addr_t)-1)&~(1ULL<<63)) # endif #endif diff --git a/kernel/resource.c b/kernel/resource.c index 4101016e8b205..d2c8143ae4ff9 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1869,7 +1869,7 @@ static resource_size_t gfr_start(struct resource *base, resource_size_t size, if (flags & GFR_DESCENDING) { resource_size_t end; - end = min_t(resource_size_t, base->end, PHYSMEM_END); + end = min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END); return end - size + 1; } @@ -1886,7 +1886,7 @@ static bool gfr_continue(struct resource *base, resource_size_t addr, * @size did not wrap 0. 
*/ return addr > addr - size && - addr <= min_t(resource_size_t, base->end, PHYSMEM_END); + addr <= min_t(resource_size_t, base->end, DIRECT_MAP_PHYSMEM_END); } static resource_size_t gfr_next(resource_size_t addr, resource_size_t size, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 621ae1015106c..c43b4e7fb2984 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1681,7 +1681,7 @@ struct range __weak arch_get_mappable_range(void) struct range mhp_get_pluggable_range(bool need_mapping) { - const u64 max_phys = PHYSMEM_END; + const u64 max_phys = DIRECT_MAP_PHYSMEM_END; struct range mhp_range; if (need_mapping) { diff --git a/mm/sparse.c b/mm/sparse.c index dc38539f85603..4cb9793f0b526 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -129,7 +129,7 @@ static inline int sparse_early_nid(struct mem_section *section) static void __meminit mminit_validate_memmodel_limits(unsigned long *start_pfn, unsigned long *end_pfn) { - unsigned long max_sparsemem_pfn = (PHYSMEM_END + 1) >> PAGE_SHIFT; + unsigned long max_sparsemem_pfn = (DIRECT_MAP_PHYSMEM_END + 1) >> PAGE_SHIFT; /* * Sanity checks - do not allow an architecture to pass From 002c5d1ca89c153e889e7fc3e0380cd807e40107 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Thu, 10 Oct 2024 18:54:39 +0300 Subject: [PATCH 069/215] mm/kmemleak: fix typo in object_no_scan() comment Replace "corresponding to the give pointer" with "corresponding to the given pointer" Link: https://lkml.kernel.org/r/20241010155439.554416-1-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Cc: Mike Rapoport (Microsoft) Signed-off-by: Andrew Morton --- mm/kmemleak.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 0400f5e8ac60d..17006d8a2afae 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1011,7 +1011,7 @@ static void object_set_excess_ref(unsigned long ptr, unsigned long excess_ref) } /* - * Set the OBJECT_NO_SCAN flag for the object corresponding to the give + * Set the OBJECT_NO_SCAN flag for the object corresponding to the given * pointer. Such object will not be scanned by kmemleak but references to it * are searched. */ From f8780515fe914ac03189213c7e485264d65e2ece Mon Sep 17 00:00:00 2001 From: MengEn Sun Date: Thu, 10 Oct 2024 20:09:36 +0800 Subject: [PATCH 070/215] mm: add pcp high_min high_max to proc zoneinfo When we do not set percpu_pagelist_high_fraction the kernel will compute the pcp high_min/max by itself, which makes it hard to determine the current high_min/max values. So output the pcp high_min/max values to /proc/zoneinfo. 
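With the change below, every per-cpu pageset block in /proc/zoneinfo gains two lines. An illustrative excerpt is shown here; the numbers are made up and the spacing is approximate, only the field names come from the patch:

	cpu: 0
		count:    151
		high:     529
		batch:    63
		high_min: 378
		high_max: 1134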
Link: https://lkml.kernel.org/r/20241010120935.656619-1-mengensun@tencent.com Signed-off-by: MengEn Sun Reviewed-by: Jinliang Zheng Signed-off-by: Andrew Morton --- mm/vmstat.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index b5a4cea423e17..1917c034c045b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1791,13 +1791,17 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, pcp = per_cpu_ptr(zone->per_cpu_pageset, i); seq_printf(m, "\n cpu: %i" - "\n count: %i" - "\n high: %i" - "\n batch: %i", + "\n count: %i" + "\n high: %i" + "\n batch: %i" + "\n high_min: %i" + "\n high_max: %i", i, pcp->count, pcp->high, - pcp->batch); + pcp->batch, + pcp->high_min, + pcp->high_max); #ifdef CONFIG_SMP pzstats = per_cpu_ptr(zone->per_cpu_zonestats, i); seq_printf(m, "\n vm stats threshold: %d", From 6359c39c9de66dede8ff5ff257c9e117483dbc7c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 10 Oct 2024 14:15:56 +0800 Subject: [PATCH 071/215] mm: remove unused hugepage for vma_alloc_folio() The hugepage parameter was deprecated since commit ddc1a5cbc05d ("mempolicy: alloc_pages_mpol() for NUMA policy without vma"), for PMD-sized THP, it still tries only preferred node if possible in vma_alloc_folio() by checking the order of the folio allocation. Link: https://lkml.kernel.org/r/20241010061556.1846751-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Barry Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Signed-off-by: Andrew Morton --- arch/alpha/include/asm/page.h | 2 +- arch/arm64/mm/fault.c | 2 +- arch/m68k/include/asm/page_no.h | 2 +- arch/s390/include/asm/page.h | 2 +- arch/x86/include/asm/page.h | 2 +- include/linux/gfp.h | 6 +++--- include/linux/highmem.h | 2 +- mm/huge_memory.c | 2 +- mm/ksm.c | 2 +- mm/memory.c | 10 ++++------ mm/mempolicy.c | 3 +-- mm/userfaultfd.c | 2 +- 12 files changed, 17 insertions(+), 20 deletions(-) diff --git a/arch/alpha/include/asm/page.h b/arch/alpha/include/asm/page.h index 70419e6be1a35..3dffa2a461d70 100644 --- a/arch/alpha/include/asm/page.h +++ b/arch/alpha/include/asm/page.h @@ -18,7 +18,7 @@ extern void clear_page(void *page); #define clear_user_page(page, vaddr, pg) clear_page(page) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) extern void copy_page(void * _to, void * _from); #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 8b281cf308b30..d95dca561f7a0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -983,7 +983,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, if (vma->vm_flags & VM_MTE) flags |= __GFP_ZEROTAGS; - return vma_alloc_folio(flags, 0, vma, vaddr, false); + return vma_alloc_folio(flags, 0, vma, vaddr); } void tag_clear_highpage(struct page *page) diff --git a/arch/m68k/include/asm/page_no.h b/arch/m68k/include/asm/page_no.h index af3a10973233c..63c0e706084b1 100644 --- a/arch/m68k/include/asm/page_no.h +++ b/arch/m68k/include/asm/page_no.h @@ -14,7 +14,7 @@ extern unsigned long memory_end; #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | 
__GFP_ZERO, 0, vma, vaddr) #define __pa(vaddr) ((unsigned long)(vaddr)) #define __va(paddr) ((void *)((unsigned long)(paddr))) diff --git a/arch/s390/include/asm/page.h b/arch/s390/include/asm/page.h index 73e1e03317b43..d02058f96bcfd 100644 --- a/arch/s390/include/asm/page.h +++ b/arch/s390/include/asm/page.h @@ -74,7 +74,7 @@ static inline void copy_page(void *to, void *from) #define copy_user_page(to, from, vaddr, pg) copy_page(to, from) #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) /* * These are used to make use of C type-checking.. diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 1b93ff80b43bc..c9fe207916f48 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -35,7 +35,7 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, } #define vma_alloc_zeroed_movable_folio(vma, vaddr) \ - vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr, false) + vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr) #ifndef __pa #define __pa(x) __phys_addr((unsigned long)(x)) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index a951de920e208..b65724c3427de 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -306,7 +306,7 @@ struct folio *folio_alloc_noprof(gfp_t gfp, unsigned int order); struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, struct mempolicy *mpol, pgoff_t ilx, int nid); struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage); + unsigned long addr); #else static inline struct page *alloc_pages_noprof(gfp_t gfp_mask, unsigned int order) { @@ -326,7 +326,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde { return folio_alloc_noprof(gfp, order); } -#define vma_alloc_folio_noprof(gfp, order, vma, addr, hugepage) \ +#define vma_alloc_folio_noprof(gfp, order, vma, addr) \ folio_alloc_noprof(gfp, order) #endif @@ -341,7 +341,7 @@ static inline struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int orde static inline struct page *alloc_page_vma_noprof(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { - struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr, false); + struct folio *folio = vma_alloc_folio_noprof(gfp, 0, vma, addr); return &folio->page; } diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 930a591b9b616..bec9bd715acf9 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -226,7 +226,7 @@ struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, { struct folio *folio; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr, false); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); if (folio) clear_user_highpage(&folio->page, vaddr); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c674afd16245c..387c046a389e7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1143,7 +1143,7 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, const int order = HPAGE_PMD_ORDER; struct folio *folio; - folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK, true); + folio = vma_alloc_folio(gfp, order, vma, addr & HPAGE_PMD_MASK); if (unlikely(!folio)) { count_vm_event(THP_FAULT_FALLBACK); diff --git a/mm/ksm.c b/mm/ksm.c index 556b8a8f37d04..e596bc1b5fa7a 100644 --- a/mm/ksm.c +++ 
b/mm/ksm.c @@ -2971,7 +2971,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio, if (!folio_test_uptodate(folio)) return folio; /* let do_swap_page report the error */ - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr, false); + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (new_folio && mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL)) { folio_put(new_folio); diff --git a/mm/memory.c b/mm/memory.c index 5e9d6a22eb088..c51bc45a70099 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1059,8 +1059,7 @@ static inline struct folio *folio_prealloc(struct mm_struct *src_mm, if (need_zero) new_folio = vma_alloc_zeroed_movable_folio(vma, addr); else - new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, - addr, false); + new_folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, addr); if (!new_folio) return NULL; @@ -4017,8 +4016,7 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf) struct folio *folio; swp_entry_t entry; - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, - vmf->address, false); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vmf->address); if (!folio) return NULL; @@ -4174,7 +4172,7 @@ static struct folio *alloc_swap_folio(struct vm_fault *vmf) gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); - folio = vma_alloc_folio(gfp, order, vma, addr, true); + folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (!mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, gfp, entry)) @@ -4713,7 +4711,7 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) gfp = vma_thp_gfp_mask(vma); while (orders) { addr = ALIGN_DOWN(vmf->address, PAGE_SIZE << order); - folio = vma_alloc_folio(gfp, order, vma, addr, true); + folio = vma_alloc_folio(gfp, order, vma, addr); if (folio) { if (mem_cgroup_charge(folio, vma->vm_mm, gfp)) { count_mthp_stat(order, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9e18a6fc30617..a29eff5d0585d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2290,7 +2290,6 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, * @order: Order of the folio. * @vma: Pointer to VMA. * @addr: Virtual address of the allocation. Must be inside @vma. - * @hugepage: Unused (was: For hugepages try only preferred node if possible). * * Allocate a folio for a specific address in @vma, using the appropriate * NUMA policy. The caller must hold the mmap_lock of the mm_struct of the @@ -2301,7 +2300,7 @@ struct folio *folio_alloc_mpol_noprof(gfp_t gfp, unsigned int order, * Return: The folio on success or NULL if allocation fails. */ struct folio *vma_alloc_folio_noprof(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, bool hugepage) + unsigned long addr) { struct mempolicy *pol; pgoff_t ilx; diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 48b87c62fc3dd..60a0be33766ff 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -251,7 +251,7 @@ static int mfill_atomic_pte_copy(pmd_t *dst_pmd, if (!*foliop) { ret = -ENOMEM; folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, dst_vma, - dst_addr, false); + dst_addr); if (!folio) goto out; From 0aa3ef3637920799f1b2f67dfff0d698127444ac Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 9 Oct 2024 17:35:50 -0700 Subject: [PATCH 072/215] memcg: add tracing for memcg stat updates The memcg stats are maintained in rstat infrastructure which provides very fast updates side and reasonable read side. 
However memcg added plethora of stats and made the read side, which is cgroup rstat flush, very slow. To solve that, threshold was added in the memcg stats read side i.e. no need to flush the stats if updates are within the threshold. This threshold based improvement worked for sometime but more stats were added to memcg and also the read codepath was getting triggered in the performance sensitive paths which made threshold based ratelimiting ineffective. We need more visibility into the hot and cold stats i.e. stats with a lot of updates. Let's add trace to get that visibility. [shakeel.butt@linux.dev: use unsigned long type for memcg_rstat_events, per Yosry] Link: https://lkml.kernel.org/r/20241015213721.3804209-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20241010003550.3695245-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Reviewed-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: T.J. Mercier Cc: Michal Hocko Cc: Muchun Song Cc: JP Kobryn Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/trace/events/memcg.h | 81 ++++++++++++++++++++++++++++++++++++ mm/memcontrol.c | 13 +++++- 2 files changed, 92 insertions(+), 2 deletions(-) create mode 100644 include/trace/events/memcg.h diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h new file mode 100644 index 0000000000000..8667e57816d21 --- /dev/null +++ b/include/trace/events/memcg.h @@ -0,0 +1,81 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM memcg + +#if !defined(_TRACE_MEMCG_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MEMCG_H + +#include +#include + + +DECLARE_EVENT_CLASS(memcg_rstat_stats, + + TP_PROTO(struct mem_cgroup *memcg, int item, int val), + + TP_ARGS(memcg, item, val), + + TP_STRUCT__entry( + __field(u64, id) + __field(int, item) + __field(int, val) + ), + + TP_fast_assign( + __entry->id = cgroup_id(memcg->css.cgroup); + __entry->item = item; + __entry->val = val; + ), + + TP_printk("memcg_id=%llu item=%d val=%d", + __entry->id, __entry->item, __entry->val) +); + +DEFINE_EVENT(memcg_rstat_stats, mod_memcg_state, + + TP_PROTO(struct mem_cgroup *memcg, int item, int val), + + TP_ARGS(memcg, item, val) +); + +DEFINE_EVENT(memcg_rstat_stats, mod_memcg_lruvec_state, + + TP_PROTO(struct mem_cgroup *memcg, int item, int val), + + TP_ARGS(memcg, item, val) +); + +DECLARE_EVENT_CLASS(memcg_rstat_events, + + TP_PROTO(struct mem_cgroup *memcg, int item, unsigned long val), + + TP_ARGS(memcg, item, val), + + TP_STRUCT__entry( + __field(u64, id) + __field(int, item) + __field(unsigned long, val) + ), + + TP_fast_assign( + __entry->id = cgroup_id(memcg->css.cgroup); + __entry->item = item; + __entry->val = val; + ), + + TP_printk("memcg_id=%llu item=%d val=%lu", + __entry->id, __entry->item, __entry->val) +); + +DEFINE_EVENT(memcg_rstat_events, count_memcg_events, + + TP_PROTO(struct mem_cgroup *memcg, int item, unsigned long val), + + TP_ARGS(memcg, item, val) +); + + +#endif /* _TRACE_MEMCG_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d6159266185f5..c93ecedf7a965 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -71,6 +71,10 @@ #include +#define CREATE_TRACE_POINTS +#include +#undef CREATE_TRACE_POINTS + #include struct cgroup_subsys memory_cgrp_subsys __read_mostly; @@ -682,7 +686,9 @@ void __mod_memcg_state(struct mem_cgroup *memcg, enum memcg_stat_item idx, return; __this_cpu_add(memcg->vmstats_percpu->state[i], val); - 
memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); + val = memcg_state_val_in_pages(idx, val); + memcg_rstat_updated(memcg, val); + trace_mod_memcg_state(memcg, idx, val); } /* idx can be of type enum memcg_stat_item or node_stat_item. */ @@ -741,7 +747,9 @@ static void __mod_memcg_lruvec_state(struct lruvec *lruvec, /* Update lruvec */ __this_cpu_add(pn->lruvec_stats_percpu->state[i], val); - memcg_rstat_updated(memcg, memcg_state_val_in_pages(idx, val)); + val = memcg_state_val_in_pages(idx, val); + memcg_rstat_updated(memcg, val); + trace_mod_memcg_lruvec_state(memcg, idx, val); memcg_stats_unlock(); } @@ -832,6 +840,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, memcg_stats_lock(); __this_cpu_add(memcg->vmstats_percpu->events[i], count); memcg_rstat_updated(memcg, count); + trace_count_memcg_events(memcg, idx, count); memcg_stats_unlock(); } From 7e1fbaa0df1dfc7b820cd41be0b2c7535d3e983a Mon Sep 17 00:00:00 2001 From: suhua Date: Sat, 12 Oct 2024 15:08:02 +0800 Subject: [PATCH 073/215] mm/hugetlb: perform vmemmap optimization batchly for specific node allocation When HVO is enabled and huge page memory allocs are made, the freed memory can be aggregated into higher order memory in the following paths, which facilitates further allocs for higher order memory. echo 200000 > /proc/sys/vm/nr_hugepages echo 200000 > /sys/devices/system/node/node*/hugepages/hugepages-2048kB/nr_hugepages grub default_hugepagesz=2M hugepagesz=2M hugepages=200000 Currently not support for releasing aggregations to higher order in the following way, which will releasing to lower order. grub: default_hugepagesz=2M hugepagesz=2M hugepages=0:100000,1:100000 This patch supports the release of huge page optimizations aggregates to higher order memory. eg: cat /proc/cmdline BOOT_IMAGE=/boot/vmlinuz-xxx ... default_hugepagesz=2M hugepagesz=2M hugepages=0:100000,1:100000 Before: Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 ... Node 0, zone Normal, type Unmovable 55282 97039 99307 0 1 1 0 1 1 1 0 Node 0, zone Normal, type Movable 25 11 345 87 48 21 2 20 9 3 75061 Node 0, zone Normal, type Reclaimable 4 2 2 4 3 0 2 1 1 1 0 Node 0, zone Normal, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 ... Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 Node 1, zone Normal, type Unmovable 98888 99650 99679 2 3 1 2 2 2 0 0 Node 1, zone Normal, type Movable 1 1 0 1 1 0 1 0 1 1 75937 Node 1, zone Normal, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 Node 1, zone Normal, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 After: Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 ... Node 0, zone Normal, type Unmovable 152 158 37 2 2 0 3 4 2 6 717 Node 0, zone Normal, type Movable 1 37 53 3 55 49 16 6 2 1 75000 Node 0, zone Normal, type Reclaimable 1 4 3 1 2 1 1 1 1 1 0 Node 0, zone Normal, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 ... 
Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10 Node 1, zone Normal, type Unmovable 5 3 2 1 3 4 2 2 2 0 779 Node 1, zone Normal, type Movable 1 0 1 1 1 0 1 0 1 1 75849 Node 1, zone Normal, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0 Node 1, zone Normal, type HighAtomic 0 0 0 0 0 0 0 0 0 0 0 Link: https://lkml.kernel.org/r/20241012070802.1876-1-suhua1@kingsoft.com Signed-off-by: suhua Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 190fa05635f4a..906294ac85dc8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3301,6 +3301,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) { unsigned long i; char buf[32]; + LIST_HEAD(folio_list); for (i = 0; i < h->max_huge_pages_node[nid]; ++i) { if (hstate_is_gigantic(h)) { @@ -3310,14 +3311,18 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) struct folio *folio; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; - folio = alloc_fresh_hugetlb_folio(h, gfp_mask, nid, - &node_states[N_MEMORY]); + folio = only_alloc_fresh_hugetlb_folio(h, gfp_mask, nid, + &node_states[N_MEMORY], NULL); if (!folio) break; - free_huge_folio(folio); /* free it into the hugepage allocator */ + list_add(&folio->lru, &folio_list); } cond_resched(); } + + if (!list_empty(&folio_list)) + prep_and_add_allocated_folios(h, &folio_list); + if (i == h->max_huge_pages_node[nid]) return; From f0c99037a0c6301ca8c3e41162dd0426b5d38abe Mon Sep 17 00:00:00 2001 From: Sidhartha Kumar Date: Fri, 11 Oct 2024 17:44:51 -0400 Subject: [PATCH 074/215] maple_tree: refactor mas_wr_store_type() In mas_wr_store_type(), we check if new_end < mt_slots[wr_mas->type]. If this check fails, we know that ,after this, new_end is >= mt_min_slots. Checking this again when we detect a wr_node_store later in the function is reduntant. Because this check is part of an OR statement, the statement will always evaluate to true, therefore we can just get rid of it. We also refactor mas_wr_store_type() to return the store type rather than set it directly as it greatly cleans up the function. Link: https://lkml.kernel.org/r/20241011214451.7286-2-sidhartha.kumar@oracle.com Signed-off-by: Sidhartha Suggested-by: Liam Howlett Suggested-by: Wei Yang Reviewed-by: Wei Yang Reviewed-by: Liam Howlett Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- lib/maple_tree.c | 72 +++++++++++++++++------------------------------- 1 file changed, 25 insertions(+), 47 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index b3b1d4b8126b4..a5e982e482dd4 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4191,24 +4191,22 @@ static inline int mas_prealloc_calc(struct ma_state *mas, void *entry) } /* - * mas_wr_store_type() - Set the store type for a given + * mas_wr_store_type() - Determine the store type for a given * store operation. 
* @wr_mas: The maple write state + * + * Return: the type of store needed for the operation */ -static inline void mas_wr_store_type(struct ma_wr_state *wr_mas) +static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) { struct ma_state *mas = wr_mas->mas; unsigned char new_end; - if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) { - mas->store_type = wr_store_root; - return; - } + if (unlikely(mas_is_none(mas) || mas_is_ptr(mas))) + return wr_store_root; - if (unlikely(!mas_wr_walk(wr_mas))) { - mas->store_type = wr_spanning_store; - return; - } + if (unlikely(!mas_wr_walk(wr_mas))) + return wr_spanning_store; /* At this point, we are at the leaf node that needs to be altered. */ mas_wr_end_piv(wr_mas); @@ -4216,50 +4214,30 @@ static inline void mas_wr_store_type(struct ma_wr_state *wr_mas) mas_wr_extend_null(wr_mas); new_end = mas_wr_new_end(wr_mas); - if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) { - mas->store_type = wr_exact_fit; - return; - } + if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) + return wr_exact_fit; - if (unlikely(!mas->index && mas->last == ULONG_MAX)) { - mas->store_type = wr_new_root; - return; - } + if (unlikely(!mas->index && mas->last == ULONG_MAX)) + return wr_new_root; /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { - if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) { - mas->store_type = wr_rebalance; - return; - } - mas->store_type = wr_node_store; - return; + if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) + return wr_rebalance; + return wr_node_store; } - if (new_end >= mt_slots[wr_mas->type]) { - mas->store_type = wr_split_store; - return; - } + if (new_end >= mt_slots[wr_mas->type]) + return wr_split_store; - if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) { - mas->store_type = wr_append; - return; - } + if (!mt_in_rcu(mas->tree) && (mas->offset == mas->end)) + return wr_append; if ((new_end == mas->end) && (!mt_in_rcu(mas->tree) || - (wr_mas->offset_end - mas->offset == 1))) { - mas->store_type = wr_slot_store; - return; - } - - if (mte_is_root(mas->node) || (new_end >= mt_min_slots[wr_mas->type]) || - (mas->mas_flags & MA_STATE_BULK)) { - mas->store_type = wr_node_store; - return; - } + (wr_mas->offset_end - mas->offset == 1))) + return wr_slot_store; - mas->store_type = wr_invalid; - MAS_WARN_ON(mas, 1); + return wr_node_store; } /** @@ -4274,7 +4252,7 @@ static inline void mas_wr_preallocate(struct ma_wr_state *wr_mas, void *entry) int request; mas_wr_prealloc_setup(wr_mas); - mas_wr_store_type(wr_mas); + mas->store_type = mas_wr_store_type(wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return; @@ -5446,7 +5424,7 @@ void *mas_store(struct ma_state *mas, void *entry) * overwrite multiple entries within a self-balancing B-Tree. 
*/ mas_wr_prealloc_setup(&wr_mas); - mas_wr_store_type(&wr_mas); + mas->store_type = mas_wr_store_type(&wr_mas); if (mas->mas_flags & MA_STATE_PREALLOC) { mas_wr_store_entry(&wr_mas); MAS_WR_BUG_ON(&wr_mas, mas_is_err(mas)); @@ -5549,7 +5527,7 @@ int mas_preallocate(struct ma_state *mas, void *entry, gfp_t gfp) int request; mas_wr_prealloc_setup(&wr_mas); - mas_wr_store_type(&wr_mas); + mas->store_type = mas_wr_store_type(&wr_mas); request = mas_prealloc_calc(mas, entry); if (!request) return ret; From 773ee2cda50c46e582a8ee2f8f00a5c8ac2923a7 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Sat, 12 Oct 2024 01:19:50 +0800 Subject: [PATCH 075/215] mm/zswap: avoid touching XArray for unnecessary invalidation zswap_invalidation simply calls xa_erase, which acquires the Xarray lock first, then does a look up. This has a higher overhead even if zswap is not used or the tree is empty. So instead, do a very lightweight xa_empty check first, if there is nothing to erase, don't touch the lock or the tree. Using xa_empty rather than zswap_never_enabled is more helpful as it cover both case where zswap wes never used or the particular range doesn't have any zswap entry. And it's safe as the swap slot should be currently pinned by caller with HAS_CACHE. Sequential SWAP in/out tests with zswap disabled showed a minor performance gain, SWAP in of zero page with zswap enabled also showed a performance gain. (swapout is basically unchanged so only test one case): Swapout of 2G zero page using brd as SWAP, zswap disabled (total time, 4 testrun, +0.1%): Before: 1705013 us 1703119 us 1704335 us 1705848 us. After: 1703579 us 1710640 us 1703625 us 1708699 us. Swapin of 2G zero page using brd as SWAP, zswap disabled (total time, 4 testrun, -3.5%): Before: 1912312 us 1915692 us 1905837 us 1912706 us. After: 1845354 us 1849691 us 1845868 us 1841828 us. Swapin of 2G zero page using brd as SWAP, zswap enabled (total time, 4 testrun, -3.3%): Before: 1897994 us 1894681 us 1899982 us 1898333 us After: 1835894 us 1834113 us 1832047 us 1833125 us Swapin of 2G random page using brd as SWAP, zswap enabled (total time, 4 testrun, -0.1%): Before: 4519747 us 4431078 us 4430185 us 4439999 us After: 4492176 us 4437796 us 4434612 us 4434289 us And the performance is very slightly better or unchanged for build kernel test with zswap enabled or disabled. 
Build Linux Kernel with defconfig and -j32 in 1G memory cgroup, using brd SWAP, zswap disabled (sys time in seconds, 6 testrun, -0.1%): Before: 1648.83 1653.52 1666.34 1665.95 1663.06 1656.67 After: 1651.36 1661.89 1645.70 1657.45 1662.07 1652.83 Build Linux Kernel with defconfig and -j32 in 2G memory cgroup, using brd SWAP zswap enabled (sys time in seconds, 6 testrun, -0.3%): Before: 1240.25 1254.06 1246.77 1265.92 1244.23 1227.74 After: 1226.41 1218.21 1249.12 1249.13 1244.39 1233.01 Link: https://lkml.kernel.org/r/20241011171950.62684-1-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Yosry Ahmed Cc: Barry Song Cc: Chengming Zhou Cc: Chris Li Cc: Johannes Weiner Cc: Nhat Pham Signed-off-by: Andrew Morton --- mm/zswap.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/zswap.c b/mm/zswap.c index 162013952074b..a9f4fa121eb2d 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1594,6 +1594,9 @@ void zswap_invalidate(swp_entry_t swp) struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry; + if (xa_empty(tree)) + return; + entry = xa_erase(tree, offset); if (entry) zswap_entry_free(entry); From 5708d96da20b99b4665ad72395e3727016057f70 Mon Sep 17 00:00:00 2001 From: Zi Yan Date: Fri, 11 Oct 2024 11:03:04 -0400 Subject: [PATCH 076/215] mm: avoid zeroing user movable page twice with init_on_alloc=1 Commit 6471384af2a6 ("mm: security: introduce init_on_alloc=1 and init_on_free=1 boot options") forces allocated page to be zeroed in post_alloc_hook() when init_on_alloc=1. For order-0 folios, if arch does not define vma_alloc_zeroed_movable_folio(), the default implementation again zeros the page return from the buddy allocator. So the page is zeroed twice. Fix it by passing __GFP_ZERO instead to avoid double page zeroing. At the moment, s390,arm64,x86,alpha,m68k are not impacted since they define their own vma_alloc_zeroed_movable_folio(). For >0 order folios (mTHP and PMD THP), folio_zero_user() is called to zero the folio again. Fix it by calling folio_zero_user() only if init_on_alloc is set. All arch are impacted. Add alloc_zeroed() helper to encapsulate the init_on_alloc check. 
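Condensed, the two cases end up as sketched below; alloc_zeroed() is the new helper added to mm/internal.h in the diff that follows:

	/* order-0: let the page allocator zero the page exactly once */
	return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr);

	/* mTHP / PMD THP: skip the second clear when init_on_alloc already zeroed the folio */
	if (!alloc_zeroed())
		folio_zero_user(folio, addr);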
[ziy@nvidia.com: comment fixes, per David] Link: https://lkml.kernel.org/r/97DB52E1-C594-49B5-9736-89AC302FAB01@nvidia.com Link: https://lkml.kernel.org/r/20241011150304.709590-1-ziy@nvidia.com Signed-off-by: Zi Yan Acked-by: Vlastimil Babka Acked-by: David Hildenbrand Cc: Alexander Potapenko Cc: "Huang, Ying" Cc: John Hubbard Cc: Kees Cook Cc: Kefeng Wang Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/highmem.h | 8 +------- mm/huge_memory.c | 8 +++++++- mm/internal.h | 6 ++++++ mm/memory.c | 10 +++++++++- 4 files changed, 23 insertions(+), 9 deletions(-) diff --git a/include/linux/highmem.h b/include/linux/highmem.h index bec9bd715acf9..6e452bd8e7e36 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -224,13 +224,7 @@ static inline struct folio *vma_alloc_zeroed_movable_folio(struct vm_area_struct *vma, unsigned long vaddr) { - struct folio *folio; - - folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, vma, vaddr); - if (folio) - clear_user_highpage(&folio->page, vaddr); - - return folio; + return vma_alloc_folio(GFP_HIGHUSER_MOVABLE | __GFP_ZERO, 0, vma, vaddr); } #endif diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 387c046a389e7..73194aa0544ce 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1162,7 +1162,13 @@ static struct folio *vma_alloc_anon_folio_pmd(struct vm_area_struct *vma, } folio_throttle_swaprate(folio, gfp); - folio_zero_user(folio, addr); + /* + * When a folio is not zeroed during allocation (__GFP_ZERO not used), + * folio_zero_user() is used to make sure that the page corresponding + * to the faulting address will be hot in the cache after zeroing. + */ + if (!alloc_zeroed()) + folio_zero_user(folio, addr); /* * The memory barrier inside __folio_mark_uptodate makes sure that * folio_zero_user writes become visible before the set_pmd_at() diff --git a/mm/internal.h b/mm/internal.h index fc2f523258a36..c743c2b21dbac 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1276,6 +1276,12 @@ void touch_pud(struct vm_area_struct *vma, unsigned long addr, void touch_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, bool write); +static inline bool alloc_zeroed(void) +{ + return static_branch_maybe(CONFIG_INIT_ON_ALLOC_DEFAULT_ON, + &init_on_alloc); +} + enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, diff --git a/mm/memory.c b/mm/memory.c index c51bc45a70099..68e57b33363b4 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4719,7 +4719,15 @@ static struct folio *alloc_anon_folio(struct vm_fault *vmf) goto next; } folio_throttle_swaprate(folio, gfp); - folio_zero_user(folio, vmf->address); + /* + * When a folio is not zeroed during allocation + * (__GFP_ZERO not used), folio_zero_user() is used + * to make sure that the page corresponding to the + * faulting address will be hot in the cache after + * zeroing. + */ + if (!alloc_zeroed()) + folio_zero_user(folio, vmf->address); return folio; } next: From 1f2d03cc535138b7cdbed0122cdc0f9e9626c6bf Mon Sep 17 00:00:00 2001 From: Jaewon Kim Date: Fri, 11 Oct 2024 21:49:28 +0900 Subject: [PATCH 077/215] vmscan: add a vmscan event for reclaim_pages reclaim_folio_list uses a dummy reclaim_stat and is not being used. To know the memory stat, add a new trace event. This is useful how how many pages are not reclaimed or why. 
This is an example: mm_vmscan_reclaim_pages: nid=0 nr_scanned=112 nr_reclaimed=112 nr_dirty=0 nr_writeback=0 nr_congested=0 nr_immediate=0 nr_activate_anon=0 nr_activate_file=0 nr_ref_keep=0 nr_unmap_fail=0 Currently reclaim_folio_list is only called by reclaim_pages, and reclaim_pages is used by damon and madvise. In the latest Android, reclaim_pages is also used by shmem to reclaim all pages in a address_space. [jaewon31.kim@samsung.com: use sc.nr_scanned rather than new counting] Link: https://lkml.kernel.org/r/20241016143227.961162-1-jaewon31.kim@samsung.com Link: https://lkml.kernel.org/r/20241011124928.1224813-1-jaewon31.kim@samsung.com Signed-off-by: Jaewon Kim Acked-by: Vlastimil Babka Cc: Jaewon Kim Cc: Kalesh Singh Cc: Minchan Kim Cc: SeongJae Park Signed-off-by: Andrew Morton --- include/trace/events/vmscan.h | 45 +++++++++++++++++++++++++++++++++++ mm/vmscan.c | 5 ++-- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 1a488c30afa53..490958fa10dee 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -346,6 +346,51 @@ TRACE_EVENT(mm_vmscan_write_folio, show_reclaim_flags(__entry->reclaim_flags)) ); +TRACE_EVENT(mm_vmscan_reclaim_pages, + + TP_PROTO(int nid, + unsigned long nr_scanned, unsigned long nr_reclaimed, + struct reclaim_stat *stat), + + TP_ARGS(nid, nr_scanned, nr_reclaimed, stat), + + TP_STRUCT__entry( + __field(int, nid) + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_reclaimed) + __field(unsigned long, nr_dirty) + __field(unsigned long, nr_writeback) + __field(unsigned long, nr_congested) + __field(unsigned long, nr_immediate) + __field(unsigned int, nr_activate0) + __field(unsigned int, nr_activate1) + __field(unsigned long, nr_ref_keep) + __field(unsigned long, nr_unmap_fail) + ), + + TP_fast_assign( + __entry->nid = nid; + __entry->nr_scanned = nr_scanned; + __entry->nr_reclaimed = nr_reclaimed; + __entry->nr_dirty = stat->nr_dirty; + __entry->nr_writeback = stat->nr_writeback; + __entry->nr_congested = stat->nr_congested; + __entry->nr_immediate = stat->nr_immediate; + __entry->nr_activate0 = stat->nr_activate[0]; + __entry->nr_activate1 = stat->nr_activate[1]; + __entry->nr_ref_keep = stat->nr_ref_keep; + __entry->nr_unmap_fail = stat->nr_unmap_fail; + ), + + TP_printk("nid=%d nr_scanned=%ld nr_reclaimed=%ld nr_dirty=%ld nr_writeback=%ld nr_congested=%ld nr_immediate=%ld nr_activate_anon=%d nr_activate_file=%d nr_ref_keep=%ld nr_unmap_fail=%ld", + __entry->nid, + __entry->nr_scanned, __entry->nr_reclaimed, + __entry->nr_dirty, __entry->nr_writeback, + __entry->nr_congested, __entry->nr_immediate, + __entry->nr_activate0, __entry->nr_activate1, + __entry->nr_ref_keep, __entry->nr_unmap_fail) +); + TRACE_EVENT(mm_vmscan_lru_shrink_inactive, TP_PROTO(int nid, diff --git a/mm/vmscan.c b/mm/vmscan.c index 6a3c498383fa1..5bec29914f122 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2129,7 +2129,7 @@ static void shrink_active_list(unsigned long nr_to_scan, static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat) { - struct reclaim_stat dummy_stat; + struct reclaim_stat stat; unsigned int nr_reclaimed; struct folio *folio; struct scan_control sc = { @@ -2140,12 +2140,13 @@ static unsigned int reclaim_folio_list(struct list_head *folio_list, .no_demotion = 1, }; - nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, true); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &stat, 
true); while (!list_empty(folio_list)) { folio = lru_to_folio(folio_list); list_del(&folio->lru); folio_putback_lru(folio); } + trace_mm_vmscan_reclaim_pages(pgdat->node_id, sc.nr_scanned, nr_reclaimed, &stat); return nr_reclaimed; } From f69c2e4dc6840cf93a3370853966657aca9f13c6 Mon Sep 17 00:00:00 2001 From: Saurabh Sengar Date: Sun, 11 Aug 2024 23:13:40 -0700 Subject: [PATCH 078/215] mm/vmstat: defer the refresh_zone_stat_thresholds after all CPUs bringup refresh_zone_stat_thresholds function has two loops which is expensive for higher number of CPUs and NUMA nodes. Below is the rough estimation of total iterations done by these loops based on number of NUMA and CPUs. Total number of iterations: nCPU * 2 * Numa * mCPU Where: nCPU = total number of CPUs Numa = total number of NUMA nodes mCPU = mean value of total CPUs (e.g., 512 for 1024 total CPUs) For the system under test with 16 NUMA nodes and 1024 CPUs, this results in a substantial increase in the number of loop iterations during boot-up when NUMA is enabled: No NUMA = 1024*2*1*512 = 1,048,576 : Here refresh_zone_stat_thresholds takes around 224 ms total for all the CPUs in the system under test. 16 NUMA = 1024*2*16*512 = 16,777,216 : Here refresh_zone_stat_thresholds takes around 4.5 seconds total for all the CPUs in the system under test. Calling this for each CPU is expensive when there are large number of CPUs along with multiple NUMAs. Fix this by deferring refresh_zone_stat_thresholds to be called later at once when all the secondary CPUs are up. Also, register the DYN hooks to keep the existing hotplug functionality intact. Link: https://lkml.kernel.org/r/1723443220-20623-1-git-send-email-ssengar@linux.microsoft.com Signed-off-by: Saurabh Sengar Acked-by: Christoph Lameter Reviewed-by: Srivatsa S. Bhat (Microsoft) Cc: Saurabh Singh Sengar Cc: Wei Liu Cc: Mel Gorman Cc: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/vmstat.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 1917c034c045b..7b62bfb19afab 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1933,6 +1933,7 @@ static const struct seq_operations vmstat_op = { #ifdef CONFIG_SMP static DEFINE_PER_CPU(struct delayed_work, vmstat_work); int sysctl_stat_interval __read_mostly = HZ; +static int vmstat_late_init_done; #ifdef CONFIG_PROC_FS static void refresh_vm_stats(struct work_struct *work) @@ -2135,7 +2136,8 @@ static void __init init_cpu_node_state(void) static int vmstat_cpu_online(unsigned int cpu) { - refresh_zone_stat_thresholds(); + if (vmstat_late_init_done) + refresh_zone_stat_thresholds(); if (!node_state(cpu_to_node(cpu), N_CPU)) { node_set_state(cpu_to_node(cpu), N_CPU); @@ -2167,6 +2169,14 @@ static int vmstat_cpu_dead(unsigned int cpu) return 0; } +static int __init vmstat_late_init(void) +{ + refresh_zone_stat_thresholds(); + vmstat_late_init_done = 1; + + return 0; +} +late_initcall(vmstat_late_init); #endif struct workqueue_struct *mm_percpu_wq; From 5b2100f723bd5c2b5552b27208f3e7d7447910d3 Mon Sep 17 00:00:00 2001 From: Jiazi Li Date: Wed, 26 Jun 2024 12:06:30 -0400 Subject: [PATCH 079/215] maple_tree: fix alloc node fail issue In the following code, the second call to the mas_node_count will return -ENOMEM: mas_node_count(mas, MAPLE_ALLOC_SLOTS + 1); mas_node_count(mas, MAPLE_ALLOC_SLOTS * 2 + 2); This is because there may be some full maple_alloc node in current maple state. Use full maple_alloc node will make max_req equal to 0. And it leads to mt_alloc_bulk return 0. 
As a result, mas_node_count set mas.node to MA_ERROR(-ENOMEM). Find a non-full maple_alloc node, and if necessary, use this non-full node in the next while loop. Link: https://lkml.kernel.org/r/20240626160631.3636515-1-Liam.Howlett@oracle.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Jiazi Li Signed-off-by: Liam R. Howlett Suggested-by: Liam R. Howlett Reviewed-by: Wei Yang Signed-off-by: Andrew Morton --- lib/maple_tree.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index a5e982e482dd4..cdac15168405b 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1285,7 +1285,10 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) node->node_count += count; allocated += count; - node = node->slot[0]; + /* find a non-full node*/ + do { + node = node->slot[0]; + } while (unlikely(node->node_count == MAPLE_ALLOC_SLOTS)); requested -= count; } mas->alloc->total = allocated; From 0f85eb3395c74d7cc823169bbacc670c6645ae80 Mon Sep 17 00:00:00 2001 From: Jiazi Li Date: Wed, 26 Jun 2024 12:06:31 -0400 Subject: [PATCH 080/215] maple_tree: add some alloc node test case Add some maple_tree alloc node tese case. Link: https://lkml.kernel.org/r/20240626160631.3636515-2-Liam.Howlett@oracle.com Signed-off-by: Jiazi Li Signed-off-by: Liam R. Howlett Suggested-by: Liam R. Howlett Cc: Wei Yang Signed-off-by: Andrew Morton --- tools/testing/radix-tree/maple.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tools/testing/radix-tree/maple.c b/tools/testing/radix-tree/maple.c index 551ae6898c1d2..bc30050227fda 100644 --- a/tools/testing/radix-tree/maple.c +++ b/tools/testing/radix-tree/maple.c @@ -462,6 +462,28 @@ static noinline void __init check_new_node(struct maple_tree *mt) MT_BUG_ON(mt, mas_allocated(&mas) != 10 + MAPLE_ALLOC_SLOTS - 1); mas_destroy(&mas); + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, MAPLE_ALLOC_SLOTS + 1); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 1); + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 2); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + mas.status = ma_start; + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 2); + mas_destroy(&mas); + + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 2 + 1); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 2 + 1); + mas.node = MA_ERROR(-ENOMEM); + mas_node_count(&mas, MAPLE_ALLOC_SLOTS * 3 + 2); /* Request */ + mas_nomem(&mas, GFP_KERNEL); /* Fill request */ + mas.status = ma_start; + MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS * 3 + 2); + mas_destroy(&mas); + mtree_unlock(mt); } From 0cc8d68abe2fdcb7039ece95f784698c0b0dc51e Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Fri, 13 Sep 2024 06:31:28 +0000 Subject: [PATCH 081/215] maple_tree: root node could be handled by !p_slot too For a root node, mte_parent_slot() return 0, this exactly fits the following !p_slot check. So we can remove the special handling for root node. Link: https://lkml.kernel.org/r/20240913063128.27391-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. 
Howlett Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index cdac15168405b..c2d3c8d273584 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2155,9 +2155,7 @@ static inline bool mas_prev_sibling(struct ma_state *mas) { unsigned int p_slot = mte_parent_slot(mas->node); - if (mte_is_root(mas->node)) - return false; - + /* For root node, p_slot is set to 0 by mte_parent_slot(). */ if (!p_slot) return false; From e852cb1d00ceb4b0156832c13ba3daf7ed93ac17 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 15 Oct 2024 12:07:44 +0000 Subject: [PATCH 082/215] maple_tree: clear request_count for new allocated one Patch series "maple_tree: simplify mas_push_node()", v2. When count is not 0, we know head is valid. So we can put the assignment in if (count) instead of checking the head pointer again. Also count represents current total, we can assign the new total by increasing the count by one. This patch (of 3): If this is not a new allocated one, the request_count has already been cleared in mas_set_alloc_req(). Link: https://lkml.kernel.org/r/20241015120746.15850-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20241015120746.15850-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index c2d3c8d273584..ee7922b19f0a2 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1265,11 +1265,11 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) mas->alloc = node; node->total = ++allocated; + node->request_count = 0; requested--; } node = mas->alloc; - node->request_count = 0; while (requested) { max_req = MAPLE_ALLOC_SLOTS - node->node_count; slots = (void **)&node->slot[node->node_count]; From 4223dd93bfc976debededffc0b03cc63d9b73d14 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 15 Oct 2024 12:07:45 +0000 Subject: [PATCH 083/215] maple_tree: total is not changed for nomem_one case If it jumps to nomem_one, the total allocated number is not changed. So we don't need to adjust it. For the nomem_bulk case, we know there is a valid mas->alloc. So we don't need to do the check. Link: https://lkml.kernel.org/r/20241015120746.15850-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index ee7922b19f0a2..1b80201865d86 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1297,10 +1297,9 @@ static inline void mas_alloc_nodes(struct ma_state *mas, gfp_t gfp) nomem_bulk: /* Clean up potential freed allocations on bulk failure */ memset(slots, 0, max_req * sizeof(unsigned long)); + mas->alloc->total = allocated; nomem_one: mas_set_alloc_req(mas, requested); - if (mas->alloc && !(((unsigned long)mas->alloc & 0x1))) - mas->alloc->total = allocated; mas_set_err(mas, -ENOMEM); } From 908378a30b0972e5bf8fae3cf38affc162fe8e3b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 15 Oct 2024 12:07:46 +0000 Subject: [PATCH 084/215] maple_tree: simplify mas_push_node() When count is not 0, we know head is valid. So we can put the assignment in if (count) instead of checking the head pointer again. 
Also count represents current total, we can assign the new total by increasing the count by one. Link: https://lkml.kernel.org/r/20241015120746.15850-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 1b80201865d86..667120a445709 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -1207,19 +1207,17 @@ static inline void mas_push_node(struct ma_state *mas, struct maple_node *used) reuse->request_count = 0; reuse->node_count = 0; - if (count && (head->node_count < MAPLE_ALLOC_SLOTS)) { - head->slot[head->node_count++] = reuse; - head->total++; - goto done; - } - - reuse->total = 1; - if ((head) && !((unsigned long)head & 0x1)) { + if (count) { + if (head->node_count < MAPLE_ALLOC_SLOTS) { + head->slot[head->node_count++] = reuse; + head->total++; + goto done; + } reuse->slot[0] = head; reuse->node_count = 1; - reuse->total += head->total; } + reuse->total = count + 1; mas->alloc = reuse; done: if (requested > 1) From e4137f08816bbf91fe76d1b60fa16862a4827ac1 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Fri, 11 Oct 2024 08:53:10 +0500 Subject: [PATCH 085/215] mm, kasan, kmsan: instrument copy_from/to_kernel_nofault Instrument copy_from_kernel_nofault() with KMSAN for uninitialized kernel memory check and copy_to_kernel_nofault() with KASAN, KCSAN to detect the memory corruption. syzbot reported that bpf_probe_read_kernel() kernel helper triggered KASAN report via kasan_check_range() which is not the expected behaviour as copy_from_kernel_nofault() is meant to be a non-faulting helper. Solution is, suggested by Marco Elver, to replace KASAN, KCSAN check in copy_from_kernel_nofault() with KMSAN detection of copying uninitilaized kernel memory. In copy_to_kernel_nofault() we can retain instrument_write() explicitly for the memory corruption instrumentation. copy_to_kernel_nofault() is tested on x86_64 and arm64 with CONFIG_KASAN_SW_TAGS. On arm64 with CONFIG_KASAN_HW_TAGS, kunit test currently fails. Need more clarification on it. 
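For illustration only (not part of this change), a minimal sketch of why the two helpers want different instrumentation; the wrapper names below are hypothetical, while copy_from_kernel_nofault()/copy_to_kernel_nofault() are the existing mm/maccess.c API:

#include <linux/uaccess.h>

/*
 * Hypothetical probe-style read: the source may be an arbitrary, possibly
 * invalid kernel address, so the source must not be KASAN/KCSAN-checked;
 * KMSAN only reports if the bytes that were actually copied are
 * uninitialized.
 */
static long probe_read_sketch(void *dst, const void *unsafe_src, size_t size)
{
	return copy_from_kernel_nofault(dst, unsafe_src, size);
}

/*
 * Hypothetical probe-style write: the destination is kernel memory we own,
 * so instrumenting the destination keeps KASAN/KCSAN able to catch
 * out-of-bounds or corrupting stores.
 */
static long probe_write_sketch(void *dst, void *src, size_t size)
{
	return copy_to_kernel_nofault(dst, src, size);
}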
[akpm@linux-foundation.org: fix comment layout, per checkpatch Link: https://lore.kernel.org/linux-mm/CANpmjNMAVFzqnCZhEity9cjiqQ9CVN1X7qeeeAp_6yKjwKo8iw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20241011035310.2982017-1-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Marco Elver Reported-by: syzbot+61123a5daeb9f7454599@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=61123a5daeb9f7454599 Reported-by: Andrey Konovalov Closes: https://bugzilla.kernel.org/show_bug.cgi?id=210505 Reviewed-by: Andrey Konovalov [KASAN] Tested-by: Andrey Konovalov [KASAN] Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_c.c | 36 ++++++++++++++++++++++++++++++++++++ mm/kmsan/kmsan_test.c | 17 +++++++++++++++++ mm/maccess.c | 10 ++++++++-- 3 files changed, 61 insertions(+), 2 deletions(-) diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index d8fb281e439d5..fe132ce3c2b34 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1928,6 +1928,41 @@ static void rust_uaf(struct kunit *test) KUNIT_EXPECT_KASAN_FAIL(test, kasan_test_rust_uaf()); } +static void copy_to_kernel_nofault_oob(struct kunit *test) +{ + char *ptr; + char buf[128]; + size_t size = sizeof(buf); + + /* + * This test currently fails with the HW_TAGS mode. The reason is + * unknown and needs to be investigated. + */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_HW_TAGS); + + ptr = kmalloc(size - KASAN_GRANULE_SIZE, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + OPTIMIZER_HIDE_VAR(ptr); + + /* + * We test copy_to_kernel_nofault() to detect corrupted memory that is + * being written into the kernel. In contrast, + * copy_from_kernel_nofault() is primarily used in kernel helper + * functions where the source address might be random or uninitialized. + * Applying KASAN instrumentation to copy_from_kernel_nofault() could + * lead to false positives. By focusing KASAN checks only on + * copy_to_kernel_nofault(), we ensure that only valid memory is + * written to the kernel, minimizing the risk of kernel corruption + * while avoiding false positives in the reverse case. 
+ */ + KUNIT_EXPECT_KASAN_FAIL(test, + copy_to_kernel_nofault(&buf[0], ptr, size)); + KUNIT_EXPECT_KASAN_FAIL(test, + copy_to_kernel_nofault(ptr, &buf[0], size)); + + kfree(ptr); +} + static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_oob_right), KUNIT_CASE(kmalloc_oob_left), @@ -2000,6 +2035,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(match_all_not_assigned), KUNIT_CASE(match_all_ptr_tag), KUNIT_CASE(match_all_mem_tag), + KUNIT_CASE(copy_to_kernel_nofault_oob), KUNIT_CASE(rust_uaf), {} }; diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c index 13236d579ebaa..9733a22c46c1d 100644 --- a/mm/kmsan/kmsan_test.c +++ b/mm/kmsan/kmsan_test.c @@ -640,6 +640,22 @@ static void test_unpoison_memory(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } +static void test_copy_from_kernel_nofault(struct kunit *test) +{ + long ret; + char buf[4], src[4]; + size_t size = sizeof(buf); + + EXPECTATION_UNINIT_VALUE_FN(expect, "copy_from_kernel_nofault"); + kunit_info( + test, + "testing copy_from_kernel_nofault with uninitialized memory\n"); + + ret = copy_from_kernel_nofault((char *)&buf[0], (char *)&src[0], size); + USE(ret); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_uninit_kmalloc), KUNIT_CASE(test_init_kmalloc), @@ -664,6 +680,7 @@ static struct kunit_case kmsan_test_cases[] = { KUNIT_CASE(test_long_origin_chain), KUNIT_CASE(test_stackdepot_roundtrip), KUNIT_CASE(test_unpoison_memory), + KUNIT_CASE(test_copy_from_kernel_nofault), {}, }; diff --git a/mm/maccess.c b/mm/maccess.c index 518a25667323e..3ca55ec63a6aa 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -13,9 +13,14 @@ bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src, return true; } +/* + * The below only uses kmsan_check_memory() to ensure uninitialized kernel + * memory isn't leaked. + */ #define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ - __get_kernel_nofault(dst, src, type, err_label); \ + __get_kernel_nofault(dst, src, type, err_label); \ + kmsan_check_memory(src, sizeof(type)); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ @@ -49,7 +54,8 @@ EXPORT_SYMBOL_GPL(copy_from_kernel_nofault); #define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \ while (len >= sizeof(type)) { \ - __put_kernel_nofault(dst, src, type, err_label); \ + __put_kernel_nofault(dst, src, type, err_label); \ + instrument_write(dst, sizeof(type)); \ dst += sizeof(type); \ src += sizeof(type); \ len -= sizeof(type); \ From 6c2625e9c2efef5272e1addf6007a1fcab1f059b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 16 Oct 2024 21:23:51 +0300 Subject: [PATCH 086/215] x86/percpu: fix clang warning when dealing with unsigned types Patch series "percpu: Add a test case and fix for clang", v2. Add a test case to percpu to check a corner case with the specific 64-bit unsigned value. This test case shows why the first patch is done in the way it's done. The before and after has been tested with binary comparison of the percpu_test module and runnig it on the real Intel system. 
This patch (of 2): When percpu_add_op() is used with an unsigned argument, it prevents kernel builds with clang, `make W=1` and CONFIG_WERROR=y: net/ipv4/tcp_output.c:187:3: error: result of comparison of constant -1 with expression of type 'u8' (aka 'unsigned char') is always false [-Werror,-Wtautological-constant-out-of-range-compare] 187 | NET_ADD_STATS(sock_net(sk), LINUX_MIB_TCPACKCOMPRESSED, | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 188 | tp->compressed_ack); | ~~~~~~~~~~~~~~~~~~~ ... arch/x86/include/asm/percpu.h:238:31: note: expanded from macro 'percpu_add_op' 238 | ((val) == 1 || (val) == -1)) ? \ | ~~~~~ ^ ~~ Fix this by casting -1 to the type of the parameter and then compare. Link: https://lkml.kernel.org/r/20241016182635.1156168-1-andriy.shevchenko@linux.intel.com Link: https://lkml.kernel.org/r/20241016182635.1156168-2-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Borislav Petkov (AMD) Cc: Christoph Lameter Cc: Dave Hansen Cc: Dennis Zhou Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ingo Molnar Cc: Tejun Heo Cc: Thomas Gleixner Cc: Uros Bizjak Signed-off-by: Andrew Morton --- arch/x86/include/asm/percpu.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index c55a79d5feaeb..e525cd85f999f 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -234,9 +234,10 @@ do { \ */ #define percpu_add_op(size, qual, var, val) \ do { \ - const int pao_ID__ = (__builtin_constant_p(val) && \ - ((val) == 1 || (val) == -1)) ? \ - (int)(val) : 0; \ + const int pao_ID__ = \ + (__builtin_constant_p(val) && \ + ((val) == 1 || \ + (val) == (typeof(val))-1)) ? (int)(val) : 0; \ \ if (0) { \ typeof(var) pao_tmp__; \ From 4a7bba1df00163ecbdf4994bc42b879ade4aeed2 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 16 Oct 2024 21:23:52 +0300 Subject: [PATCH 087/215] percpu: add a test case for the specific 64-bit value addition It might be a corner case when we add UINT_MAX as 64-bit unsigned value to the percpu variable as it's not the same as -1 (ULONG_LONG_MAX). Add a test case for that. Link: https://lkml.kernel.org/r/20241016182635.1156168-3-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Borislav Petkov (AMD) Cc: Christoph Lameter Cc: Dave Hansen Cc: Dennis Zhou Cc: "H. Peter Anvin" Cc: Ingo Molnar Cc: Ingo Molnar Cc: Tejun Heo Cc: Thomas Gleixner Cc: Uros Bizjak Signed-off-by: Andrew Morton --- lib/percpu_test.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/percpu_test.c b/lib/percpu_test.c index 4a3d70bbc1a08..ce7124b16dabc 100644 --- a/lib/percpu_test.c +++ b/lib/percpu_test.c @@ -1,4 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include /* validate @native and @pcp counter values match @expected */ @@ -24,8 +25,9 @@ static int __init percpu_test_init(void) * +ul_one/-ul_one below would replace with inc/dec instructions. 
*/ volatile unsigned int ui_one = 1; - long l = 0; + unsigned long long ull = 0; unsigned long ul = 0; + long l = 0; pr_info("percpu test start\n"); @@ -112,6 +114,13 @@ static int __init percpu_test_init(void) CHECK(ul, ulong_counter, -1); CHECK(ul, ulong_counter, ULONG_MAX); + ul = ull = 0; + __this_cpu_write(ulong_counter, 0); + + ul = ull += UINT_MAX; + __this_cpu_add(ulong_counter, ull); + CHECK(ul, ulong_counter, UINT_MAX); + ul = 3; __this_cpu_write(ulong_counter, 3); From d3ea85c6c5f70acff970f3339afb2da8f9a805a6 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Wed, 16 Oct 2024 16:10:41 +0200 Subject: [PATCH 088/215] mm: swap: use str_true_false() helper function Remove hard-coded strings by using the helper function str_true_false(). Link: https://lkml.kernel.org/r/20241016141040.79168-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Signed-off-by: Andrew Morton --- mm/swap_state.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 4669f29cf5557..e0c0321b8ff71 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -889,8 +889,7 @@ struct folio *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, static ssize_t vma_ra_enabled_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sysfs_emit(buf, "%s\n", - enable_vma_readahead ? "true" : "false"); + return sysfs_emit(buf, "%s\n", str_true_false(enable_vma_readahead)); } static ssize_t vma_ra_enabled_store(struct kobject *kobj, struct kobj_attribute *attr, From f1001f3d3b6868998cab73d10fda1a5c99ddf963 Mon Sep 17 00:00:00 2001 From: Wei Xu Date: Thu, 17 Oct 2024 18:15:28 +0000 Subject: [PATCH 089/215] mm/mglru: reset page lru tier bits when activating When a folio is activated, lru_gen_add_folio() moves the folio to the youngest generation. But unlike folio_update_gen()/folio_inc_gen(), lru_gen_add_folio() doesn't reset the folio lru tier bits (LRU_REFS_MASK | LRU_REFS_FLAGS). This inconsistency can affect how pages are aged via folio_mark_accessed() (e.g. fd accesses), though no user visible impact related to this has been detected yet. Note that lru_gen_add_folio() cannot clear PG_workingset if the activation is due to workingset refault, otherwise PSI accounting will be skipped. So fix lru_gen_add_folio() to clear the lru tier bits other than PG_workingset when activating a folio, and also clear all the lru tier bits when a folio is activated via folio_activate() in lru_gen_look_around(). 
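For reference (illustration only, not part of the change), the fix leans on the semantics of set_mask_bits(ptr, mask, bits), which atomically performs *ptr = (*ptr & ~mask) | bits, so the whole reference-tier state can be dropped in one call; this is what the new folio_clear_lru_refs() helper below wraps:

	/* illustration: clear LRU_REFS_MASK plus PG_referenced/PG_workingset */
	set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0);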
Link: https://lkml.kernel.org/r/20241017181528.3358821-1-weixugc@google.com Fixes: 018ee47f1489 ("mm: multi-gen LRU: exploit locality in rmap") Signed-off-by: Wei Xu Cc: Axel Rasmussen Cc: Brian Geffon Cc: Jan Alexander Steffens Cc: Suleiman Souhlal Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 15 ++++++++++++++- include/linux/mmzone.h | 2 ++ mm/vmscan.c | 8 ++++---- 3 files changed, 20 insertions(+), 5 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 6f801c7b36e2f..355cf46a01a61 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -155,6 +155,11 @@ static inline int folio_lru_refs(struct folio *folio) return ((flags & LRU_REFS_MASK) >> LRU_REFS_PGOFF) + workingset; } +static inline void folio_clear_lru_refs(struct folio *folio) +{ + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); +} + static inline int folio_lru_gen(struct folio *folio) { unsigned long flags = READ_ONCE(folio->flags); @@ -222,6 +227,7 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, { unsigned long seq; unsigned long flags; + unsigned long mask; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -257,7 +263,14 @@ static inline bool lru_gen_add_folio(struct lruvec *lruvec, struct folio *folio, gen = lru_gen_from_seq(seq); flags = (gen + 1UL) << LRU_GEN_PGOFF; /* see the comment on MIN_NR_GENS about PG_active */ - set_mask_bits(&folio->flags, LRU_GEN_MASK | BIT(PG_active), flags); + mask = LRU_GEN_MASK; + /* + * Don't clear PG_workingset here because it can affect PSI accounting + * if the activation is due to workingset refault. + */ + if (folio_test_active(folio)) + mask |= LRU_REFS_MASK | BIT(PG_referenced) | BIT(PG_active); + set_mask_bits(&folio->flags, mask, flags); lru_gen_update_size(lruvec, folio, -1, gen); /* for folio_rotate_reclaimable() */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5b1c984daf454..2e8c4307c7284 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -403,6 +403,8 @@ enum { NR_LRU_GEN_CAPS }; +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) + #define MIN_LRU_BATCH BITS_PER_LONG #define MAX_LRU_BATCH (MIN_LRU_BATCH * 64) diff --git a/mm/vmscan.c b/mm/vmscan.c index 5bec29914f122..8d1301c0f22a1 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2603,8 +2603,6 @@ static bool should_clear_pmd_young(void) * shorthand helpers ******************************************************************************/ -#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) - #define DEFINE_MAX_SEQ(lruvec) \ unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) @@ -4142,8 +4140,10 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) old_gen = folio_lru_gen(folio); if (old_gen < 0) folio_set_referenced(folio); - else if (old_gen != new_gen) + else if (old_gen != new_gen) { + folio_clear_lru_refs(folio); folio_activate(folio); + } } arch_leave_lazy_mmu_mode(); @@ -4376,7 +4376,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca /* see the comment on MAX_NR_TIERS */ if (!folio_test_referenced(folio)) - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + folio_clear_lru_refs(folio); /* for shrink_folio_list() */ folio_clear_reclaim(folio); From 7146de5ff504003ed6f61c39c379b5777e7bed29 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Thu, 17 Oct 2024 17:56:38 +0100 Subject: [PATCH 090/215] tools: testing: fix 
phys_addr_t size on 64-bit systems The phys_addr_t size is predicated on whether CONFIG_PHYS_ADDR_T_64BIT is set or not. In the VMA tests, virt_to_phys() from tools/include/linux casts a volatile void * pointer to phys_addr_t, if CONFIG_PHYS_ADDR_T_64BIT is not set, this will be 32-bit and trigger a warning. Obviously this might also lead to truncation, which we would rather avoid. Fix this by adjusting the generation of generated/bit-length.h to generate a CONFIG_PHYS_ADDR_T{bits}BIT define. This does result in the generation of the useless CONFIG_PHYS_ADDR_T_32BIT define for 32-bit systems, but this should have no effect, and makes implementation of this easier. This resolves the issue and the warning. [lorenzo.stoakes@oracle.com: VMA tests not properly importing bit-length.h] Link: https://lkml.kernel.org/r/a6183df9-3108-4d59-8128-4fc6c14e22a5@lucifer.local Link: https://lkml.kernel.org/r/20241017165638.95602-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Tested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Cc: Jann Horn Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/shared/shared.mk | 1 + tools/testing/vma/vma.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/tools/testing/shared/shared.mk b/tools/testing/shared/shared.mk index a6bc51d0b0bfb..923ee2492256b 100644 --- a/tools/testing/shared/shared.mk +++ b/tools/testing/shared/shared.mk @@ -69,6 +69,7 @@ generated/bit-length.h: FORCE @if ! grep -qws CONFIG_$(LONG_BIT)BIT generated/bit-length.h; then \ echo "Generating $@"; \ echo "#define CONFIG_$(LONG_BIT)BIT 1" > $@; \ + echo "#define CONFIG_PHYS_ADDR_T_$(LONG_BIT)BIT 1" >> $@; \ fi FORCE: ; diff --git a/tools/testing/vma/vma.c b/tools/testing/vma/vma.c index b33b47342d418..8fab5e13c7c3b 100644 --- a/tools/testing/vma/vma.c +++ b/tools/testing/vma/vma.c @@ -4,6 +4,8 @@ #include #include +#include "generated/bit-length.h" + #include "maple-shared.h" #include "vma_internal.h" From 5a90c155defa684f3a21f68c3f8e40c056e6114c Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 17 Oct 2024 22:17:42 +0800 Subject: [PATCH 091/215] tmpfs: don't enable large folios if not supported tmpfs can support large folios, but there are some configurable options (mount options and runtime deny/force) to enable/disable large folio allocation, so there is a performance issue when performing writes without large folios. The issue is similar to commit 4e527d5841e2 ("iomap: fault in smaller chunks for non-large folio mappings"). Since 'deny' is for emergencies and 'force' is for testing, performance issues should not be a problem in real production environments, so don't call mapping_set_large_folios() in __shmem_get_inode() when large folio is disabled with mount huge=never option (default policy). 
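As an illustration, a hypothetical helper (not part of the patch) equivalent to the new sbinfo->huge check, given that SHMEM_HUGE_NEVER is 0 in mm/shmem.c:

static bool shmem_mount_wants_large_folios(struct shmem_sb_info *sbinfo)
{
	/*
	 * Hypothetical sketch: huge=never (the default) keeps the mapping on
	 * order-0 folios; the runtime 'deny' and 'force' overrides are
	 * deliberately not considered here, as explained above.
	 */
	return sbinfo->huge != SHMEM_HUGE_NEVER;
}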
Link: https://lkml.kernel.org/r/20241017141742.1169404-1-wangkefeng.wang@huawei.com Fixes: 9aac777aaf94 ("filemap: Convert generic_perform_write() to support large folios") Signed-off-by: Kefeng Wang Cc: Alexander Viro Cc: Baolin Wang Cc: Christian Brauner Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/shmem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 6ad50ba60d8ef..98fb539434f49 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2842,7 +2842,10 @@ static struct inode *__shmem_get_inode(struct mnt_idmap *idmap, cache_no_acl(inode); if (sbinfo->noswap) mapping_set_unevictable(inode->i_mapping); - mapping_set_large_folios(inode->i_mapping); + + /* Don't consider 'deny' for emergencies and 'force' for testing */ + if (sbinfo->huge) + mapping_set_large_folios(inode->i_mapping); switch (mode & S_IFMT) { default: From 9884efd795cc2f71ef3b7f42df32420b0b7ce34f Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 17 Oct 2024 22:14:56 +0800 Subject: [PATCH 092/215] mm: huge_memory: move file_thp_enabled() into huge_memory.c file_thp_enabled() is only used in __thp_vma_allowable_orders(), so move it into huge_memory.c, also check READ_ONLY_THP_FOR_FS ahead to avoid unnecessary code if config disabled. Link: https://lkml.kernel.org/r/20241017141457.1169092-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: David Hildenbrand Reviewed-by: Baolin Wang Cc: Barry Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 13 ------------- mm/huge_memory.c | 15 +++++++++++++++ 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 8afe09a2cf03b..006f730545c2a 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -252,19 +252,6 @@ static inline unsigned long thp_vma_suitable_orders(struct vm_area_struct *vma, return orders; } -static inline bool file_thp_enabled(struct vm_area_struct *vma) -{ - struct inode *inode; - - if (!vma->vm_file) - return false; - - inode = vma->vm_file->f_inode; - - return (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) && - !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); -} - unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long tva_flags, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 73194aa0544ce..492c16eaf147a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -83,6 +83,21 @@ unsigned long huge_anon_orders_madvise __read_mostly; unsigned long huge_anon_orders_inherit __read_mostly; static bool anon_orders_configured __initdata; +static inline bool file_thp_enabled(struct vm_area_struct *vma) +{ + struct inode *inode; + + if (!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS)) + return false; + + if (!vma->vm_file) + return false; + + inode = file_inode(vma->vm_file); + + return !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); +} + unsigned long __thp_vma_allowable_orders(struct vm_area_struct *vma, unsigned long vm_flags, unsigned long tva_flags, From 4a9a27fdf7bfd29013491aea45e3512988cc5876 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Thu, 17 Oct 2024 22:14:57 +0800 Subject: [PATCH 093/215] mm: shmem: remove __shmem_huge_global_enabled() Remove __shmem_huge_global_enabled() since it as only one caller, and remove repeated check of VM_NOHUGEPAGE/MMF_DISABLE_THP as they are checked in shmem_allowable_huge_orders(), also remove unnecessary vma 
parameter. Link: https://lkml.kernel.org/r/20241017141457.1169092-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Barry Song Cc: Hugh Dickins Cc: Matthew Wilcox Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/shmem.c | 33 ++++++++++----------------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 98fb539434f49..ebf39aa0b9ab5 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -548,17 +548,15 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - struct vm_area_struct *vma, - unsigned long vm_flags) +static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, + loff_t write_end, bool shmem_huge_force, + unsigned long vm_flags) { - struct mm_struct *mm = vma ? vma->vm_mm : NULL; loff_t i_size; - if (!S_ISREG(inode->i_mode)) + if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) return false; - if (mm && ((vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &mm->flags))) + if (!S_ISREG(inode->i_mode)) return false; if (shmem_huge == SHMEM_HUGE_DENY) return false; @@ -576,7 +574,7 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, return true; fallthrough; case SHMEM_HUGE_ADVISE: - if (mm && (vm_flags & VM_HUGEPAGE)) + if (vm_flags & VM_HUGEPAGE) return true; fallthrough; default: @@ -584,17 +582,6 @@ static bool __shmem_huge_global_enabled(struct inode *inode, pgoff_t index, } } -static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - struct vm_area_struct *vma, unsigned long vm_flags) -{ - if (HPAGE_PMD_ORDER > MAX_PAGECACHE_ORDER) - return false; - - return __shmem_huge_global_enabled(inode, index, write_end, - shmem_huge_force, vma, vm_flags); -} - #if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { @@ -772,8 +759,8 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, - loff_t write_end, bool shmem_huge_force, - struct vm_area_struct *vma, unsigned long vm_flags) + loff_t write_end, bool shmem_huge_force, + unsigned long vm_flags) { return false; } @@ -1170,7 +1157,7 @@ static int shmem_getattr(struct mnt_idmap *idmap, generic_fillattr(idmap, request_mask, inode, stat); inode_unlock_shared(inode); - if (shmem_huge_global_enabled(inode, 0, 0, false, NULL, 0)) + if (shmem_huge_global_enabled(inode, 0, 0, false, 0)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1687,7 +1674,7 @@ unsigned long shmem_allowable_huge_orders(struct inode *inode, return 0; global_huge = shmem_huge_global_enabled(inode, index, write_end, - shmem_huge_force, vma, vm_flags); + shmem_huge_force, vm_flags); if (!vma || !vma_is_anon_shmem(vma)) { /* * For tmpfs, we now only support PMD sized THP if huge page From 0938b1614648d5fbd832449a5a8a1b51d985323d Mon Sep 17 00:00:00 2001 From: Pankaj Raghav Date: Thu, 17 Oct 2024 08:23:42 +0200 Subject: [PATCH 094/215] mm: don't set readahead flag on a folio when lookahead_size > nr_to_read The readahead flag is set on a folio based on the lookahead_size and nr_to_read. For example, when the readahead happens from index to index + nr_to_read, then the readahead `mark` offset from index is set at nr_to_read - lookahead_size. 
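For contrast, a well-behaved window (hypothetical numbers, mapping_min_order = 0): index = 128, nr_to_read = 32, lookahead_size = 8 gives mark = 32 - 8 = 24, so the readahead flag is set on the folio at index 128 + 24 = 152, safely inside the window.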
There are some scenarios where the lookahead_size > nr_to_read. For example, readahead window was created, but the file was truncated before the readahead starts. do_page_cache_ra() will clamp the nr_to_read if the readahead window extends beyond EOF after truncation. If this happens, readahead flag should not be set on any folio on the current readahead window. The current calculation for `mark` with mapping_min_order > 0 gives incorrect results when lookahead_size > nr_to_read due to rounding up operation: index = 128 nr_to_read = 16 lookahead_size = 28 mapping_min_order = 4 (16 pages) ra_folio_index = round_up(128 + 16 - 28, 16) = 128; mark = 128 - 128 = 0; # offset from index to set RA flag In the above example, the lookahead_size is actually lying outside the current readahead window. Without this patch, RA flag will be set incorrectly on the folio at index 128. This can lead to marking the readahead flag on the wrong folio, therefore, triggering a readahead when it is not necessary. Explicitly initialize `mark` to be ULONG_MAX and only calculate it when lookahead_size is within the readahead window. Link: https://lkml.kernel.org/r/20241017062342.478973-1-kernel@pankajraghav.com Fixes: 26cfdb395eef ("readahead: allocate folios with mapping_min_order in readahead") Signed-off-by: Pankaj Raghav Cc: Luis Chamberlain Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/readahead.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/mm/readahead.c b/mm/readahead.c index 3dc6c7a128dd3..475d2940a1edb 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -206,9 +206,9 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, unsigned long nr_to_read, unsigned long lookahead_size) { struct address_space *mapping = ractl->mapping; - unsigned long ra_folio_index, index = readahead_index(ractl); + unsigned long index = readahead_index(ractl); gfp_t gfp_mask = readahead_gfp_mask(mapping); - unsigned long mark, i = 0; + unsigned long mark = ULONG_MAX, i = 0; unsigned int min_nrpages = mapping_min_folio_nrpages(mapping); /* @@ -232,9 +232,14 @@ void page_cache_ra_unbounded(struct readahead_control *ractl, * index that only has lookahead or "async_region" to set the * readahead flag. */ - ra_folio_index = round_up(readahead_index(ractl) + nr_to_read - lookahead_size, - min_nrpages); - mark = ra_folio_index - index; + if (lookahead_size <= nr_to_read) { + unsigned long ra_folio_index; + + ra_folio_index = round_up(readahead_index(ractl) + + nr_to_read - lookahead_size, + min_nrpages); + mark = ra_folio_index - index; + } nr_to_read += readahead_index(ractl) - index; ractl->_index = index; From 61e9df7085cca6b62e9d230ed807eb524126a105 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 17 Oct 2024 01:58:08 +0000 Subject: [PATCH 095/215] maple_tree: calculate new_end when needed Patch series "Following cleanup after introduce mas_wr_store_type()", v2. Patch 1 postpone new_end calculation when needed. Patch 2 removes a unnecessary sanity check in mas_wr_slot_store(). This patch (of 2): For wr_exact_fit/wr_new_root, we don't need to calculate new_end. Let's postpone it until necessary. Link: https://lkml.kernel.org/r/20241017015809.23392-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20241017015809.23392-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Sidhartha Kumar Reviewed-by: Liam R. 
Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 667120a445709..bc30e99d6cf0c 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -4211,13 +4211,13 @@ static inline enum store_type mas_wr_store_type(struct ma_wr_state *wr_mas) if (!wr_mas->entry) mas_wr_extend_null(wr_mas); - new_end = mas_wr_new_end(wr_mas); if ((wr_mas->r_min == mas->index) && (wr_mas->r_max == mas->last)) return wr_exact_fit; if (unlikely(!mas->index && mas->last == ULONG_MAX)) return wr_new_root; + new_end = mas_wr_new_end(wr_mas); /* Potential spanning rebalance collapsing a node */ if (new_end < mt_min_slots[wr_mas->type]) { if (!mte_is_root(mas->node) && !(mas->mas_flags & MA_STATE_BULK)) From 38dc8f495246667b543de4cc646fce2925e4cf3b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 17 Oct 2024 01:58:09 +0000 Subject: [PATCH 096/215] maple_tree: remove sanity check from mas_wr_slot_store() After commit 5d659bbb52a2 ("maple_tree: introduce mas_wr_store_type()"), the check here is redundant. Let's remove it. Link: https://lkml.kernel.org/r/20241017015809.23392-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Sidhartha Kumar Reviewed-by: Liam R. Howlett Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index bc30e99d6cf0c..38aa8abf8eb81 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3897,7 +3897,8 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) wr_mas->pivots[offset] = mas->index - 1; mas->offset++; /* Keep mas accurate. */ } - } else if (!mt_in_rcu(mas->tree)) { + } else { + WARN_ON_ONCE(mt_in_rcu(mas->tree)); /* * Expand the range, only partially overwriting the previous and * next ranges @@ -3907,8 +3908,6 @@ static inline void mas_wr_slot_store(struct ma_wr_state *wr_mas) wr_mas->pivots[offset] = mas->index - 1; wr_mas->pivots[offset + 1] = mas->last; mas->offset++; /* Keep mas accurate. */ - } else { - return; } trace_ma_write(__func__, mas, 0, wr_mas->entry); From 58f1069311db63ed9f330fdba4418a13ab49d843 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Fri, 18 Oct 2024 13:41:13 -0400 Subject: [PATCH 097/215] mm/mremap: cleanup vma_to_resize() Patch series "mm/mremap: Remove extra vma tree walk", v2. An extra vma tree walk was discovered in some mremap call paths during the discussion on mseal() changes. This patch set removes the extra vma tree walk and further cleans up mremap_to(). This patch (of 2): vma_to_resize() is used in two locations to find and validate the vma for the mremap location. One of the two locations already has the vma, which is then re-found to validate the same vma. This code can be simplified by moving the vma_lookup() from vma_to_resize() to mremap_to() and changing the return type to an int error. Since the function now just validates the vma, the function is renamed to resize_is_valid() to better reflect what it is doing. This commit also adds documentation about the function. Link: https://lkml.kernel.org/r/20241018174114.2871880-1-Liam.Howlett@oracle.com Link: https://lkml.kernel.org/r/20241018174114.2871880-2-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: Pedro Falcato Reviewed-by: Lorenzo Stoakes Cc: David Hildenbrand Cc: Jann Horn Cc: Jeff Xu Cc: Kefeng Wang Cc: Qi Zheng Signed-off-by: Andrew Morton --- mm/mremap.c | 53 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 22 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 5917feafe8cc5..e781ec4573ca3 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -826,17 +826,24 @@ static unsigned long move_vma(struct vm_area_struct *vma, return new_addr; } -static struct vm_area_struct *vma_to_resize(unsigned long addr, +/* + * resize_is_valid() - Ensure the vma can be resized to the new length at the give + * address. + * + * @vma: The vma to resize + * @addr: The old address + * @old_len: The current size + * @new_len: The desired size + * @flags: The vma flags + * + * Return 0 on success, error otherwise. + */ +static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; unsigned long pgoff; - vma = vma_lookup(mm, addr); - if (!vma) - return ERR_PTR(-EFAULT); - /* * !old_len is a special case where an attempt is made to 'duplicate' * a mapping. This makes no sense for private mappings as it will @@ -847,37 +854,37 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr, */ if (!old_len && !(vma->vm_flags & (VM_SHARED | VM_MAYSHARE))) { pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid); - return ERR_PTR(-EINVAL); + return -EINVAL; } if ((flags & MREMAP_DONTUNMAP) && (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP))) - return ERR_PTR(-EINVAL); + return -EINVAL; /* We can't remap across vm area boundaries */ if (old_len > vma->vm_end - addr) - return ERR_PTR(-EFAULT); + return -EFAULT; if (new_len == old_len) - return vma; + return 0; /* Need to be careful about a growing mapping */ pgoff = (addr - vma->vm_start) >> PAGE_SHIFT; pgoff += vma->vm_pgoff; if (pgoff + (new_len >> PAGE_SHIFT) < pgoff) - return ERR_PTR(-EINVAL); + return -EINVAL; if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) - return ERR_PTR(-EFAULT); + return -EFAULT; if (!mlock_future_ok(mm, vma->vm_flags, new_len - old_len)) - return ERR_PTR(-EAGAIN); + return -EAGAIN; if (!may_expand_vm(mm, vma->vm_flags, (new_len - old_len) >> PAGE_SHIFT)) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - return vma; + return 0; } static unsigned long mremap_to(unsigned long addr, unsigned long old_len, @@ -936,12 +943,16 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, old_len = new_len; } - vma = vma_to_resize(addr, old_len, new_len, flags); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); + vma = vma_lookup(mm, addr); + if (!vma) { + ret = -EFAULT; goto out; } + ret = resize_is_valid(vma, addr, old_len, new_len, flags); + if (ret) + goto out; + /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (flags & MREMAP_DONTUNMAP && !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) { @@ -1114,11 +1125,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Ok, we need to grow.. */ - vma = vma_to_resize(addr, old_len, new_len, flags); - if (IS_ERR(vma)) { - ret = PTR_ERR(vma); + ret = resize_is_valid(vma, addr, old_len, new_len, flags); + if (ret) goto out; - } /* old_len exactly to the end of the area.. */ From 4b6b0a5188c219cf40d6e863e55e2a5ca39e51cd Mon Sep 17 00:00:00 2001 From: "Liam R. 
Howlett" Date: Fri, 18 Oct 2024 13:41:14 -0400 Subject: [PATCH 098/215] mm/mremap: remove goto from mremap_to() mremap_to() has a goto label at the end that doesn't unwind anything. Removing the label makes the code cleaner. This commit also adds documentation to the function. Link: https://lkml.kernel.org/r/20241018174114.2871880-3-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Pedro Falcato Cc: David Hildenbrand Cc: Jann Horn Cc: Jeff Xu Cc: Kefeng Wang Cc: Lorenzo Stoakes Cc: Qi Zheng Signed-off-by: Andrew Morton --- mm/mremap.c | 46 +++++++++++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index e781ec4573ca3..4c79ab92eb8f5 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -887,6 +887,20 @@ static int resize_is_valid(struct vm_area_struct *vma, unsigned long addr, return 0; } +/* + * mremap_to() - remap a vma to a new location + * @addr: The old address + * @old_len: The old size + * @new_addr: The target address + * @new_len: The new size + * @locked: If the returned vma is locked (VM_LOCKED) + * @flags: the mremap flags + * @uf: The mremap userfaultfd context + * @uf_unmap_early: The userfaultfd unmap early context + * @uf_unmap: The userfaultfd unmap context + * + * Returns: The new address of the vma or an error. + */ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, unsigned long new_addr, unsigned long new_len, bool *locked, unsigned long flags, struct vm_userfaultfd_ctx *uf, @@ -895,18 +909,18 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long ret = -EINVAL; + unsigned long ret; unsigned long map_flags = 0; if (offset_in_page(new_addr)) - goto out; + return -EINVAL; if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) - goto out; + return -EINVAL; /* Ensure the old/new locations do not overlap */ if (addr + old_len > new_addr && new_addr + new_len > addr) - goto out; + return -EINVAL; /* * move_vma() need us to stay 4 maps below the threshold, otherwise @@ -933,31 +947,28 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, */ ret = do_munmap(mm, new_addr, new_len, uf_unmap_early); if (ret) - goto out; + return ret; } if (old_len > new_len) { ret = do_munmap(mm, addr+new_len, old_len - new_len, uf_unmap); if (ret) - goto out; + return ret; old_len = new_len; } vma = vma_lookup(mm, addr); - if (!vma) { - ret = -EFAULT; - goto out; - } + if (!vma) + return -EFAULT; ret = resize_is_valid(vma, addr, old_len, new_len, flags); if (ret) - goto out; + return ret; /* MREMAP_DONTUNMAP expands by old_len since old_len == new_len */ if (flags & MREMAP_DONTUNMAP && !may_expand_vm(mm, vma->vm_flags, old_len >> PAGE_SHIFT)) { - ret = -ENOMEM; - goto out; + return -ENOMEM; } if (flags & MREMAP_FIXED) @@ -970,17 +981,14 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len, ((addr - vma->vm_start) >> PAGE_SHIFT), map_flags); if (IS_ERR_VALUE(ret)) - goto out; + return ret; /* We got a new mapping */ if (!(flags & MREMAP_FIXED)) new_addr = ret; - ret = move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, uf, - uf_unmap); - -out: - return ret; + return move_vma(vma, addr, old_len, new_len, new_addr, locked, flags, + uf, uf_unmap); } static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) From 5bb6345cd2edfceef1749950ce786f205e56a90b Mon Sep 17 00:00:00 2001 From: Dev Jain Date: Fri, 18 Oct 2024 
15:11:51 +0530 Subject: [PATCH 099/215] mm: remove redundant condition for THP folio folio_test_pmd_mappable() implies folio_test_large(), therefore, simplify the expression for is_thp. Link: https://lkml.kernel.org/r/20241018094151.3458-1-dev.jain@arm.com Signed-off-by: Dev Jain Reviewed-by: Matthew Wilcox (Oracle) Acked-by: David Hildenbrand Reviewed-by: Zi Yan Reviewed-by: Anshuman Khandual Cc: "Huang, Ying" Signed-off-by: Andrew Morton --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/migrate.c b/mm/migrate.c index 72c6657f4f72c..dfb5eba3c5223 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1733,7 +1733,7 @@ static int migrate_pages_batch(struct list_head *from, list_for_each_entry_safe(folio, folio2, from, lru) { is_large = folio_test_large(folio); - is_thp = is_large && folio_test_pmd_mappable(folio); + is_thp = folio_test_pmd_mappable(folio); nr_pages = folio_nr_pages(folio); cond_resched(); From b7f058f827392022d8c689329f88c7b324d71dad Mon Sep 17 00:00:00 2001 From: Luoxi Li Date: Fri, 18 Oct 2024 17:22:35 +0800 Subject: [PATCH 100/215] mm: remove unused has_isolate_pageblock has_isolate_pageblock() has been unused since commit 55612e80e722 ("mm: page_alloc: close migratetype race between freeing and stealing") Remove it. Link: https://lkml.kernel.org/r/20241018092235.2764859-1-kaixa@kiloview.com Signed-off-by: Luoxi Li Acked-by: David Hildenbrand Reviewed-by: Muhammad Usama Anjum Acked-by: Johannes Weiner Reviewed-by: Anshuman Khandual Cc: Baolin Wang Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/page-isolation.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index c16db00670907..73dc2c1841ec1 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -3,10 +3,6 @@ #define __LINUX_PAGEISOLATION_H #ifdef CONFIG_MEMORY_ISOLATION -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return zone->nr_isolate_pageblock; -} static inline bool is_migrate_isolate_page(struct page *page) { return get_pageblock_migratetype(page) == MIGRATE_ISOLATE; @@ -16,10 +12,6 @@ static inline bool is_migrate_isolate(int migratetype) return migratetype == MIGRATE_ISOLATE; } #else -static inline bool has_isolate_pageblock(struct zone *zone) -{ - return false; -} static inline bool is_migrate_isolate_page(struct page *page) { return false; From f3650ef89b879d63c63f04e98481f7ed4df1119a Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 18 Oct 2024 11:00:27 +0800 Subject: [PATCH 101/215] mm: shmem: update iocb->ki_pos directly to simplify tmpfs read logic Patch series "Improve the tmpfs large folio read performance", v2. tmpfs already supports PMD-sized large folios, but the tmpfs read operation still performs copying at PAGE_SIZE granularity, which is not perfect. This patchset changes tmpfs to copy data at the folio granularity, which can improve the read performance. Use 'fio bs=64k' to read a 1G tmpfs file populated with 2M THPs, and I can see about 20% performance improvement, and no regression with bs=4k. I also did some functional testing with the xfstests suite, and I did not find any regressions with the following xfstests config: FSTYP=tmpfs export TEST_DIR=/mnt/tempfs_mnt export TEST_DEV=/mnt/tempfs_mnt export SCRATCH_MNT=/mnt/scratchdir export SCRATCH_DEV=/mnt/scratchdir This patch (of 2): Using iocb->ki_pos to check if the read bytes exceeds the file size and to calculate the bytes to be read can help simplify the code logic. 
Meanwhile, this is also a preparation for improving tmpfs large folios read performance in the following patch. Link: https://lkml.kernel.org/r/cover.1729218573.git.baolin.wang@linux.alibaba.com Link: https://lkml.kernel.org/r/e8863e289577e0dc1e365b5419bf2d1c9a24ae3d.1729218573.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/shmem.c | 35 +++++++++++------------------------ 1 file changed, 11 insertions(+), 24 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index ebf39aa0b9ab5..26aaddc52fd1d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3093,27 +3093,19 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) unsigned long offset; int error = 0; ssize_t retval = 0; - loff_t *ppos = &iocb->ki_pos; - index = *ppos >> PAGE_SHIFT; - offset = *ppos & ~PAGE_MASK; + offset = iocb->ki_pos & ~PAGE_MASK; for (;;) { struct folio *folio = NULL; struct page *page = NULL; - pgoff_t end_index; unsigned long nr, ret; - loff_t i_size = i_size_read(inode); + loff_t end_offset, i_size = i_size_read(inode); - end_index = i_size >> PAGE_SHIFT; - if (index > end_index) + if (unlikely(iocb->ki_pos >= i_size)) break; - if (index == end_index) { - nr = i_size & ~PAGE_MASK; - if (nr <= offset) - break; - } + index = iocb->ki_pos >> PAGE_SHIFT; error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) @@ -3135,18 +3127,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * We must evaluate after, since reads (unlike writes) * are called without i_rwsem protection against truncate */ - nr = PAGE_SIZE; i_size = i_size_read(inode); - end_index = i_size >> PAGE_SHIFT; - if (index == end_index) { - nr = i_size & ~PAGE_MASK; - if (nr <= offset) { - if (folio) - folio_put(folio); - break; - } + if (unlikely(iocb->ki_pos >= i_size)) { + if (folio) + folio_put(folio); + break; } - nr -= offset; + end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); + nr = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset); if (folio) { /* @@ -3186,8 +3174,8 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) retval += ret; offset += ret; - index += offset >> PAGE_SHIFT; offset &= ~PAGE_MASK; + iocb->ki_pos += ret; if (!iov_iter_count(to)) break; @@ -3198,7 +3186,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) cond_resched(); } - *ppos = ((loff_t) index << PAGE_SHIFT) + offset; file_accessed(file); return retval ? retval : error; } From a284cb8472ec6bb027ebf3b936385601d8a8f414 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Fri, 18 Oct 2024 11:00:28 +0800 Subject: [PATCH 102/215] mm: shmem: improve the tmpfs large folio read performance tmpfs already supports PMD-sized large folios, but the tmpfs read operation still performs copying at PAGE_SIZE granularity, which is unreasonable. This patch changes tmpfs to copy data at folio granularity, which can improve the read performance, as well as changing to use folio related functions. Moreover, if a large folio has a subpage that is hwpoisoned, it will still fall back to page granularity copying. Use 'fio bs=64k' to read a 1G tmpfs file populated with 2M THPs, and I can see about 20% performance improvement, and no regression with bs=4k. 
Before the patch: READ: bw=10.0GiB/s After the patch: READ: bw=12.0GiB/s Link: https://lkml.kernel.org/r/2129a21a5b9f77d3bb7ddec152c009ce7c5653c4.1729218573.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Yang Shi Cc: David Hildenbrand Cc: Hugh Dickins Cc: Kefeng Wang Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/shmem.c | 34 ++++++++++++++++++++++++---------- 1 file changed, 24 insertions(+), 10 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 26aaddc52fd1d..06da05f984dab 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3094,13 +3094,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) int error = 0; ssize_t retval = 0; - offset = iocb->ki_pos & ~PAGE_MASK; - for (;;) { struct folio *folio = NULL; struct page *page = NULL; unsigned long nr, ret; loff_t end_offset, i_size = i_size_read(inode); + bool fallback_page_copy = false; + size_t fsize; if (unlikely(iocb->ki_pos >= i_size)) break; @@ -3121,6 +3121,10 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) error = -EIO; break; } + + if (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio)) + fallback_page_copy = true; } /* @@ -3134,7 +3138,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } end_offset = min_t(loff_t, i_size, iocb->ki_pos + to->count); - nr = min_t(loff_t, end_offset - iocb->ki_pos, PAGE_SIZE - offset); + if (folio && likely(!fallback_page_copy)) + fsize = folio_size(folio); + else + fsize = PAGE_SIZE; + offset = iocb->ki_pos & (fsize - 1); + nr = min_t(loff_t, end_offset - iocb->ki_pos, fsize - offset); if (folio) { /* @@ -3142,10 +3151,15 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (mapping_writably_mapped(mapping)) - flush_dcache_page(page); + if (mapping_writably_mapped(mapping)) { + if (likely(!fallback_page_copy)) + flush_dcache_folio(folio); + else + flush_dcache_page(page); + } + /* - * Mark the page accessed if we read the beginning. + * Mark the folio accessed if we read the beginning. */ if (!offset) folio_mark_accessed(folio); @@ -3153,9 +3167,11 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ - ret = copy_page_to_iter(page, offset, nr, to); + if (likely(!fallback_page_copy)) + ret = copy_folio_to_iter(folio, offset, nr, to); + else + ret = copy_page_to_iter(page, offset, nr, to); folio_put(folio); - } else if (user_backed_iter(to)) { /* * Copy to user tends to be so well optimized, but @@ -3173,8 +3189,6 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) } retval += ret; - offset += ret; - offset &= ~PAGE_MASK; iocb->ki_pos += ret; if (!iov_iter_count(to)) From 78c018e3942c5dfbab7e6edb4eb784943878504b Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 7 Oct 2024 23:47:45 +0200 Subject: [PATCH 103/215] maple_tree: fix outdated flag name in comment MAPLE_USE_RCU was renamed to MT_FLAGS_USE_RCU at some point, fix up the comment. Link: https://lkml.kernel.org/r/20241007-maple-tree-doc-fix-v1-1-6bbf89c1153d@google.com Signed-off-by: Jann Horn Reviewed-by: Liam R. 
Howlett Reviewed-by: Wei Yang Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index c2c11004085e5..61c236850ca86 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -224,7 +224,7 @@ typedef struct { /* nothing */ } lockdep_map_p; * (set at tree creation time) and dynamic information set under the spinlock. * * Another use of flags are to indicate global states of the tree. This is the - * case with the MAPLE_USE_RCU flag, which indicates the tree is currently in + * case with the MT_FLAGS_USE_RCU flag, which indicates the tree is currently in * RCU mode. This mode was added to allow the tree to reuse nodes instead of * re-allocating and RCU freeing nodes when there is a single user. */ From ed265529d39ac408396c031a4fd7e1ef922b80d0 Mon Sep 17 00:00:00 2001 From: Sourav Panda Date: Tue, 22 Oct 2024 23:24:40 +0000 Subject: [PATCH 104/215] mm/codetag: fix arg in pgalloc_tag_copy alloc_tag_sub alloc_tag_sub() takes bytes as opposed to number of pages as argument. Currently pgalloc_tag_copy() passes the number of pages. This fix passes the correct unit, which is the number of bytes allocated. Link: https://lkml.kernel.org/r/20241022232440.334820-1-souravpanda@google.com Fixes: e0a955bf7f61 ("mm/codetag: add pgalloc_tag_copy()") Signed-off-by: Sourav Panda Acked-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Reviewed-by: Anshuman Khandual Cc: Wei Xu Cc: Yu Zhao Cc: Kent Overstreet Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 4570f33e2429a..eb070c14e3099 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4207,7 +4207,7 @@ static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) /* Clear the old ref to the original allocation tag. */ clear_page_tag_ref(&old->page); /* Decrement the counters of the tag on get_new_folio. */ - alloc_tag_sub(ref, folio_nr_pages(new)); + alloc_tag_sub(ref, folio_size(new)); __alloc_tag_ref_set(ref, tag); From 722376934b6c0b8692f32784d7755bbe5be67529 Mon Sep 17 00:00:00 2001 From: Manas Date: Fri, 4 Oct 2024 23:12:16 +0530 Subject: [PATCH 105/215] mm/memory.c: simplify pfnmap_lockdep_assert Use local `mapping' to reduce the pointer chasing. akpm: extracted from a bugfix which Linus fixed with b1b46751671be ("mm: fix follow_pfnmap API lockdep assert"). Link: https://lkml.kernel.org/r/20241004-fix-null-deref-v4-1-d0a8ec01ac85@iiitd.ac.in Signed-off-by: Manas Reviewed-by: Peter Xu Cc: Anup Sharma Cc: Shuah Khan Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory.c b/mm/memory.c index 68e57b33363b4..2d32023d4eb87 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -6388,7 +6388,7 @@ static inline void pfnmap_lockdep_assert(struct vm_area_struct *vma) struct address_space *mapping = file ? 
file->f_mapping : NULL; if (mapping) - lockdep_assert(lockdep_is_held(&vma->vm_file->f_mapping->i_mmap_rwsem) || + lockdep_assert(lockdep_is_held(&mapping->i_mmap_rwsem) || lockdep_is_held(&vma->vm_mm->mmap_lock)); else lockdep_assert(lockdep_is_held(&vma->vm_mm->mmap_lock)); From 39ac99852fca98ca44d52716d792dfaf24981f53 Mon Sep 17 00:00:00 2001 From: Jim Zhao Date: Wed, 23 Oct 2024 18:00:32 +0800 Subject: [PATCH 106/215] mm/page-writeback: raise wb_thresh to prevent write blocking with strictlimit With the strictlimit flag, wb_thresh acts as a hard limit in balance_dirty_pages() and wb_position_ratio(). When device write operations are inactive, wb_thresh can drop to 0, causing writes to be blocked. The issue occasionally occurs in fuse fs, particularly with network backends, the write thread is blocked frequently during a period. To address it, this patch raises the minimum wb_thresh to a controllable level, similar to the non-strictlimit case. Link: https://lkml.kernel.org/r/20241023100032.62952-1-jimzhao.ai@gmail.com Signed-off-by: Jim Zhao Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page-writeback.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fcd4c1439cb9c..1d7179aba8e3e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -917,7 +917,9 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, unsigned long thresh) { struct wb_domain *dom = dtc_dom(dtc); + struct bdi_writeback *wb = dtc->wb; u64 wb_thresh; + u64 wb_max_thresh; unsigned long numerator, denominator; unsigned long wb_min_ratio, wb_max_ratio; @@ -931,11 +933,28 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc, wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); - wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); + wb_min_max_ratio(wb, &wb_min_ratio, &wb_max_ratio); wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); - if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE)) - wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); + wb_max_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); + if (wb_thresh > wb_max_thresh) + wb_thresh = wb_max_thresh; + + /* + * With strictlimit flag, the wb_thresh is treated as + * a hard limit in balance_dirty_pages() and wb_position_ratio(). + * It's possible that wb_thresh is close to zero, not because + * the device is slow, but because it has been inactive. + * To prevent occasional writes from being blocked, we raise wb_thresh. 
+ */ + if (unlikely(wb->bdi->capabilities & BDI_CAP_STRICTLIMIT)) { + unsigned long limit = hard_dirty_limit(dom, dtc->thresh); + u64 wb_scale_thresh = 0; + + if (limit > dtc->dirty) + wb_scale_thresh = (limit - dtc->dirty) / 100; + wb_thresh = max(wb_thresh, min(wb_scale_thresh, wb_max_thresh / 4)); + } return wb_thresh; } From 3f1f947a322d2bdf0b16ff9158ce6be7cc23b974 Mon Sep 17 00:00:00 2001 From: Liu Jing Date: Tue, 22 Oct 2024 09:25:26 +0800 Subject: [PATCH 107/215] tools/mm: free the allocated memory The comm_str memory needs to be freed if the search_pattern function call fails in get_comm [akpm@linux-foundation.org: fix whitespace] Link: https://lkml.kernel.org/r/20241022012526.7597-1-liujing@cmss.chinamobile.com Signed-off-by: Liu Jing Signed-off-by: Andrew Morton --- tools/mm/page_owner_sort.c | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/mm/page_owner_sort.c b/tools/mm/page_owner_sort.c index e1f2644443429..880e36df0c118 100644 --- a/tools/mm/page_owner_sort.c +++ b/tools/mm/page_owner_sort.c @@ -377,6 +377,7 @@ static char *get_comm(char *buf) if (errno != 0) { if (debug_on) fprintf(stderr, "wrong comm in follow buf:\n%s\n", buf); + free(comm_str); return NULL; } From 628e1b8c4777941e119effc92cd395b4b02c2c5f Mon Sep 17 00:00:00 2001 From: James Houghton Date: Mon, 21 Oct 2024 16:02:12 +0000 Subject: [PATCH 108/215] mm: add missing mmu_notifier_clear_young for !MMU_NOTIFIER Remove the now unnecessary ifdef in mm/damon/vaddr.c as well. Link: https://lkml.kernel.org/r/20241021160212.9935-1-jthoughton@google.com Signed-off-by: James Houghton Reviewed-by: Jason Gunthorpe Acked-by: David Hildenbrand Reviewed-by: SeongJae Park Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- include/linux/mmu_notifier.h | 7 +++++++ mm/damon/vaddr.c | 2 -- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index d39ebb10caeb6..e2dd57ca368b0 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -606,6 +606,13 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_clear_young(struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + return 0; +} + static inline int mmu_notifier_test_young(struct mm_struct *mm, unsigned long address) { diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 08cfd22b52492..821990d0141a0 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -353,11 +353,9 @@ static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, set_huge_pte_at(mm, addr, pte, entry, psize); } -#ifdef CONFIG_MMU_NOTIFIER if (mmu_notifier_clear_young(mm, addr, addr + huge_page_size(hstate_vma(vma)))) referenced = true; -#endif /* CONFIG_MMU_NOTIFIER */ if (referenced) folio_set_young(folio); From 8717734fdcc8472174830a647d47122b4581d62a Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Mon, 21 Oct 2024 14:00:26 +0100 Subject: [PATCH 109/215] mm/memcontrol: fix seq_buf size to save memory when PAGE_SIZE is large Previously the seq_buf used for accumulating the memory.stat output was sized at PAGE_SIZE. But the amount of output is invariant to PAGE_SIZE; If 4K is enough on a 4K page system, then it should also be enough on a 64K page system, so we can save 60K on the static buffer used in mem_cgroup_print_oom_meminfo(). Let's make it so. This also has the beneficial side effect of removing a place in the code that assumed PAGE_SIZE is a compile-time constant. 
So this helps our quest towards supporting boot-time page size selection. Link: https://lkml.kernel.org/r/20241021130027.3615969-1-ryan.roberts@arm.com Signed-off-by: Ryan Roberts Acked-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Michal Hocko Acked-by: Roman Gushchin Acked-by: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c93ecedf7a965..5fcdd25fc1342 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -118,6 +118,7 @@ struct mem_cgroup *vmpressure_to_memcg(struct vmpressure *vmpr) return container_of(vmpr, struct mem_cgroup, vmpressure); } +#define SEQ_BUF_SIZE SZ_4K #define CURRENT_OBJCG_UPDATE_BIT 0 #define CURRENT_OBJCG_UPDATE_FLAG (1UL << CURRENT_OBJCG_UPDATE_BIT) @@ -1527,7 +1528,7 @@ void mem_cgroup_print_oom_context(struct mem_cgroup *memcg, struct task_struct * void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) { /* Use static buffer, for the caller is holding oom_lock. */ - static char buf[PAGE_SIZE]; + static char buf[SEQ_BUF_SIZE]; struct seq_buf s; lockdep_assert_held(&oom_lock); @@ -1553,7 +1554,7 @@ void mem_cgroup_print_oom_meminfo(struct mem_cgroup *memcg) pr_info("Memory cgroup stats for "); pr_cont_cgroup_path(memcg->css.cgroup); pr_cont(":"); - seq_buf_init(&s, buf, sizeof(buf)); + seq_buf_init(&s, buf, SEQ_BUF_SIZE); memory_stat_format(memcg, &s); seq_buf_do_printk(&s, KERN_INFO); } @@ -4196,12 +4197,12 @@ static int memory_events_local_show(struct seq_file *m, void *v) int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); - char *buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + char *buf = kmalloc(SEQ_BUF_SIZE, GFP_KERNEL); struct seq_buf s; if (!buf) return -ENOMEM; - seq_buf_init(&s, buf, PAGE_SIZE); + seq_buf_init(&s, buf, SEQ_BUF_SIZE); memory_stat_format(memcg, &s); seq_puts(m, buf); kfree(buf); From ab505e8be02457bb56c1902179664f2ae912c8d6 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Mon, 21 Oct 2024 11:13:40 +0200 Subject: [PATCH 110/215] mm/page_alloc: use str_off_on() helper in build_all_zonelists() Remove hard-coded strings by using the str_off_on() helper function. Link: https://lkml.kernel.org/r/20241021091340.5243-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8ad38cd5e574b..0d6301120fc41 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5493,7 +5493,7 @@ void __ref build_all_zonelists(pg_data_t *pgdat) pr_info("Built %u zonelists, mobility grouping %s. Total pages: %ld\n", nr_online_nodes, - page_group_by_mobility_disabled ? "off" : "on", + str_off_on(page_group_by_mobility_disabled), vm_total_pages); #ifdef CONFIG_NUMA pr_info("Policy zone: %s\n", zone_names[policy_zone]); From f3c7a1ede435e2e45177d7a490a85fb0a0ec96d1 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Tue, 22 Oct 2024 16:39:26 +0800 Subject: [PATCH 111/215] mm/damon/vaddr: fix issue in damon_va_evenly_split_region() Patch series "mm/damon/vaddr: Fix issue in damon_va_evenly_split_region()". v2. 
According to the logic of damon_va_evenly_split_region(), the following
split case currently does not meet the expectation. Suppose
DAMON_MIN_REGION=0x1000:

  Case: split [0x0, 0x3000) into 2 pieces, then the result would actually
  be 3 regions:
    [0x0, 0x1000), [0x1000, 0x2000), [0x2000, 0x3000)
  but NOT the expected 2 regions:
    [0x0, 0x1000), [0x1000, 0x3000)

The root cause is that when calculating the size of each split piece in
damon_va_evenly_split_region():

  `sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);`

both the division and the ALIGN_DOWN may lose precision, so repeatedly
splitting off pieces of size 'sz_piece' from the original 'start' towards
'end' produces more pieces than expected.

To fix it, count each split piece and make sure no more than 'nr_pieces'
are produced. In addition, add the above case to damon_test_split_evenly().
Also add an 'nr_piece == 1' check in damon_va_evenly_split_region() for
better code readability, together with a corresponding kunit testcase.

This patch (of 2):

According to the logic of damon_va_evenly_split_region(), the following
split case currently does not meet the expectation. Suppose
DAMON_MIN_REGION=0x1000:

  Case: split [0x0, 0x3000) into 2 pieces, then the result would actually
  be 3 regions:
    [0x0, 0x1000), [0x1000, 0x2000), [0x2000, 0x3000)
  but NOT the expected 2 regions:
    [0x0, 0x1000), [0x1000, 0x3000)

The root cause is that when calculating the size of each split piece in
damon_va_evenly_split_region():

  `sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION);`

both the division and the ALIGN_DOWN may lose precision, so repeatedly
splitting off pieces of size 'sz_piece' from the original 'start' towards
'end' produces more pieces than expected.

To fix it, count each split piece and make sure no more than 'nr_pieces'
are produced. In addition, add the above case to damon_test_split_evenly().

After this patch, the damon-operations test passed:

  # ./tools/testing/kunit/kunit.py run damon-operations
  [...]
============== damon-operations (6 subtests) =============== [PASSED] damon_test_three_regions_in_vmas [PASSED] damon_test_apply_three_regions1 [PASSED] damon_test_apply_three_regions2 [PASSED] damon_test_apply_three_regions3 [PASSED] damon_test_apply_three_regions4 [PASSED] damon_test_split_evenly ================ [PASSED] damon-operations ================= Link: https://lkml.kernel.org/r/20241022083927.3592237-1-zhengyejian@huaweicloud.com Link: https://lkml.kernel.org/r/20241022083927.3592237-2-zhengyejian@huaweicloud.com Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: Zheng Yejian Reviewed-by: SeongJae Park Cc: Fernand Sieber Cc: Leonard Foerster Cc: Shakeel Butt Cc: Ye Weihua Cc: Signed-off-by: Andrew Morton --- mm/damon/tests/vaddr-kunit.h | 1 + mm/damon/vaddr.c | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index 3dad8dfd9005f..fcdccb614fd85 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -300,6 +300,7 @@ static void damon_test_split_evenly(struct kunit *test) damon_test_split_evenly_fail(test, 0, 100, 0); damon_test_split_evenly_succ(test, 0, 100, 10); damon_test_split_evenly_succ(test, 5, 59, 5); + damon_test_split_evenly_succ(test, 0, 3, 2); damon_test_split_evenly_fail(test, 5, 6, 2); } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 821990d0141a0..86f612fbf8866 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -67,6 +67,7 @@ static int damon_va_evenly_split_region(struct damon_target *t, unsigned long sz_orig, sz_piece, orig_end; struct damon_region *n = NULL, *next; unsigned long start; + unsigned int i; if (!r || !nr_pieces) return -EINVAL; @@ -80,8 +81,7 @@ static int damon_va_evenly_split_region(struct damon_target *t, r->ar.end = r->ar.start + sz_piece; next = damon_next_region(r); - for (start = r->ar.end; start + sz_piece <= orig_end; - start += sz_piece) { + for (start = r->ar.end, i = 1; i < nr_pieces; start += sz_piece, i++) { n = damon_new_region(start, start + sz_piece); if (!n) return -ENOMEM; From 477327e10639a1ec5698847030b494dc75de33e4 Mon Sep 17 00:00:00 2001 From: Zheng Yejian Date: Tue, 22 Oct 2024 16:39:27 +0800 Subject: [PATCH 112/215] mm/damon/vaddr: add 'nr_piece == 1' check in damon_va_evenly_split_region() As discussed in [1], damon_va_evenly_split_region() is called to size-evenly split a region into 'nr_pieces' small regions, when nr_pieces == 1, no actual split is required. Check that case for better code readability and add a simple kunit testcase. 
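As a concrete, standalone illustration of the behaviour this series fixes, the following userspace C sketch mimics the arithmetic from the changelog (it is not the DAMON code itself; the final-piece end adjustment is folded into the counted loop for brevity):

#include <stdio.h>

#define DAMON_MIN_REGION 0x1000UL
#define ALIGN_DOWN(x, a) ((x) & ~((a) - 1))

static void split(unsigned long start, unsigned long end,
		  unsigned int nr_pieces, int counted)
{
	unsigned long sz_piece = ALIGN_DOWN((end - start) / nr_pieces,
					    DAMON_MIN_REGION);
	unsigned long s = start;
	unsigned int i;

	printf("  [%#lx, %#lx)\n", s, s + sz_piece);	/* first piece */
	s += sz_piece;

	if (counted) {
		/* fixed behaviour: emit exactly nr_pieces regions; the last
		 * one absorbs any rounding remainder up to 'end' */
		for (i = 1; i < nr_pieces; i++, s += sz_piece)
			printf("  [%#lx, %#lx)\n", s,
			       i == nr_pieces - 1 ? end : s + sz_piece);
	} else {
		/* old behaviour: keep carving sz_piece chunks until 'end' */
		for (; s + sz_piece <= end; s += sz_piece)
			printf("  [%#lx, %#lx)\n", s, s + sz_piece);
	}
}

int main(void)
{
	printf("old loop, [0x0, 0x3000) into 2 pieces:\n");
	split(0x0, 0x3000, 2, 0);
	printf("counted loop, [0x0, 0x3000) into 2 pieces:\n");
	split(0x0, 0x3000, 2, 1);
	return 0;
}

Running it shows the old end-bounded loop emitting three regions for the [0x0, 0x3000) / 2-piece case, while the counted loop emits exactly two. The nr_pieces == 1 case needs no split at all, which is the explicit check added here.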
[1] https://lore.kernel.org/all/20241021163316.12443-1-sj@kernel.org/ Link: https://lkml.kernel.org/r/20241022083927.3592237-3-zhengyejian@huaweicloud.com Signed-off-by: Zheng Yejian Reviewed-by: SeongJae Park Cc: Fernand Sieber Cc: Leonard Foerster Cc: Shakeel Butt Cc: Ye Weihua Signed-off-by: Andrew Morton --- mm/damon/tests/vaddr-kunit.h | 1 + mm/damon/vaddr.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/mm/damon/tests/vaddr-kunit.h b/mm/damon/tests/vaddr-kunit.h index fcdccb614fd85..b9fe3bc8472ba 100644 --- a/mm/damon/tests/vaddr-kunit.h +++ b/mm/damon/tests/vaddr-kunit.h @@ -300,6 +300,7 @@ static void damon_test_split_evenly(struct kunit *test) damon_test_split_evenly_fail(test, 0, 100, 0); damon_test_split_evenly_succ(test, 0, 100, 10); damon_test_split_evenly_succ(test, 5, 59, 5); + damon_test_split_evenly_succ(test, 4, 6, 1); damon_test_split_evenly_succ(test, 0, 3, 2); damon_test_split_evenly_fail(test, 5, 6, 2); } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 86f612fbf8866..b9eaa20b73b9a 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -72,6 +72,9 @@ static int damon_va_evenly_split_region(struct damon_target *t, if (!r || !nr_pieces) return -EINVAL; + if (nr_pieces == 1) + return 0; + orig_end = r->ar.end; sz_orig = damon_sz_region(r); sz_piece = ALIGN_DOWN(sz_orig / nr_pieces, DAMON_MIN_REGION); From 729881ffd390797077cec0e573d33b4d724d70b3 Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Sat, 26 Oct 2024 21:51:52 +0800 Subject: [PATCH 113/215] mm: shmem: fallback to page size splice if large folio has poisoned pages The tmpfs has already supported the PMD-sized large folios, and splice() can not read any pages if the large folio has a poisoned page, which is not good as Matthew pointed out in a previous email[1]: "so if we have hwpoison set on one page in a folio, we now can't read bytes from any page in the folio? That seems like we've made a bad situation worse." Thus add a fallback to the PAGE_SIZE splice() still allows reading normal pages if the large folio has hwpoisoned pages. 
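For reference, here is a small standalone sketch of the per-iteration clamp this fallback introduces (illustrative userspace C, not the kernel code; positions and sizes are made up): when the large folio has a hwpoisoned subpage, each splice iteration is limited to the remainder of the current PAGE_SIZE page, so healthy pages stay readable and only the poisoned page itself returns -EIO.

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

/* Bytes one splice iteration may take from file position 'ppos':
 * 'len' is what the caller still wants, 'isize' is the file size. */
static uint64_t splice_chunk(uint64_t ppos, uint64_t len, uint64_t isize,
			     int fallback_page_splice)
{
	uint64_t size = len;

	if (fallback_page_splice) {
		/* equivalent of: offset = *ppos & ~PAGE_MASK */
		uint64_t offset = ppos & (PAGE_SIZE - 1);

		if (size > PAGE_SIZE - offset)
			size = PAGE_SIZE - offset;
	}
	return size < isize - ppos ? size : isize - ppos;
}

int main(void)
{
	uint64_t ppos = 5 * PAGE_SIZE + 123;	/* arbitrary position */
	uint64_t len = 1 << 20, isize = 4ULL << 20;

	printf("no hwpoison            : %llu bytes this iteration\n",
	       (unsigned long long)splice_chunk(ppos, len, isize, 0));
	printf("hwpoisoned large folio : %llu bytes this iteration\n",
	       (unsigned long long)splice_chunk(ppos, len, isize, 1));
	return 0;
}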
[1] https://lore.kernel.org/all/Zw_d0EVAJkpNJEbA@casper.infradead.org/ [baolin.wang@linux.alibaba.com: code layout cleaup, per dhowells] Link: https://lkml.kernel.org/r/32dd938c-3531-49f7-93e4-b7ff21fec569@linux.alibaba.com Link: https://lkml.kernel.org/r/e3737fbd5366c4de4337bf5f2044817e77a5235b.1729915173.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Cc: David Hildenbrand Cc: David Howells Cc: Hugh Dickins Cc: Kefeng Wang Cc: Matthew Wilcox (Oracle) Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/shmem.c | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 06da05f984dab..5afc5b1f7ae14 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3288,11 +3288,16 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, len = min_t(size_t, len, npages * PAGE_SIZE); do { + bool fallback_page_splice = false; + struct page *page = NULL; + pgoff_t index; + size_t size; + if (*ppos >= i_size_read(inode)) break; - error = shmem_get_folio(inode, *ppos / PAGE_SIZE, 0, &folio, - SGP_READ); + index = *ppos >> PAGE_SHIFT; + error = shmem_get_folio(inode, index, 0, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; @@ -3301,12 +3306,15 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, if (folio) { folio_unlock(folio); - if (folio_test_hwpoison(folio) || - (folio_test_large(folio) && - folio_test_has_hwpoisoned(folio))) { + page = folio_file_page(folio, index); + if (PageHWPoison(page)) { error = -EIO; break; } + + if (folio_test_large(folio) && + folio_test_has_hwpoisoned(folio)) + fallback_page_splice = true; } /* @@ -3320,7 +3328,17 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, isize = i_size_read(inode); if (unlikely(*ppos >= isize)) break; - part = min_t(loff_t, isize - *ppos, len); + /* + * Fallback to PAGE_SIZE splice if the large folio has hwpoisoned + * pages. + */ + size = len; + if (unlikely(fallback_page_splice)) { + size_t offset = *ppos & ~PAGE_MASK; + + size = umin(size, PAGE_SIZE - offset); + } + part = min_t(loff_t, isize - *ppos, size); if (folio) { /* @@ -3328,8 +3346,12 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, * virtual addresses, take care about potential aliasing * before reading the page on the kernel side. */ - if (mapping_writably_mapped(mapping)) - flush_dcache_folio(folio); + if (mapping_writably_mapped(mapping)) { + if (likely(!fallback_page_splice)) + flush_dcache_folio(folio); + else + flush_dcache_page(page); + } folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so we can From aa6b4fdf59406b67e308cfb186456a176cdc0088 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 24 Oct 2024 18:22:58 -0700 Subject: [PATCH 114/215] memcg-v1: fully deprecate move_charge_at_immigrate Patch series "memcg-v1: fully deprecate charge moving". The memcg v1's charge moving feature has been deprecated for almost 2 years and the kernel warns if someone try to use it. This warning has been backported to all stable kernel and there have not been any report of the warning or the request to support this feature anymore. Let's proceed to fully deprecate this feature. This patch (of 6): Proceed with the complete deprecation of memcg v1's charge moving feature. The deprecation warning has been in the kernel for almost two years and has been ported to all stable kernel since. Now is the time to fully deprecate this feature. 
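A hedged sketch of the resulting user-visible behaviour (the cgroup path below is an assumption: it presumes a legacy cgroup v1 memory controller mounted at /sys/fs/cgroup/memory with a child group called 'test' created beforehand; run as root):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* hypothetical path -- adjust to where cgroup v1 memory is mounted */
	const char *path =
		"/sys/fs/cgroup/memory/test/memory.move_charge_at_immigrate";
	char buf[16] = "";
	int fd = open(path, O_RDWR);

	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* reading now always yields "0" */
	if (read(fd, buf, sizeof(buf) - 1) > 0)
		printf("read: %s", buf);

	/* writing any non-zero value is rejected */
	lseek(fd, 0, SEEK_SET);
	if (write(fd, "3", 1) < 0)
		printf("write 3 -> %s (expect: Invalid argument)\n",
		       strerror(errno));

	close(fd);
	return 0;
}

With this series, reading the knob always yields 0, writing 0 is still accepted (with a one-time deprecation warning), and any non-zero value is rejected with EINVAL.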
Link: https://lkml.kernel.org/r/20241025012304.2473312-1-shakeel.butt@linux.dev Link: https://lkml.kernel.org/r/20241025012304.2473312-2-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Muchun Song Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- .../admin-guide/cgroup-v1/memory.rst | 82 +------------------ mm/memcontrol-v1.c | 14 +--- 2 files changed, 5 insertions(+), 91 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 270501db9f4e8..286d16fc22ebb 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -90,9 +90,7 @@ Brief summary of control files. used. memory.swappiness set/show swappiness parameter of vmscan (See sysctl's vm.swappiness) - memory.move_charge_at_immigrate set/show controls of moving charges - This knob is deprecated and shouldn't be - used. + memory.move_charge_at_immigrate This knob is deprecated. memory.oom_control set/show oom controls. This knob is deprecated and shouldn't be used. @@ -243,10 +241,6 @@ behind this approach is that a cgroup that aggressively uses a shared page will eventually get charged for it (once it is uncharged from the cgroup that brought it in -- this will happen on memory pressure). -But see :ref:`section 8.2 ` when moving a -task to another cgroup, its pages may be recharged to the new cgroup, if -move_charge_at_immigrate has been chosen. - 2.4 Swap Extension -------------------------------------- @@ -756,78 +750,8 @@ If we want to change this to 1G, we can at any time use:: THIS IS DEPRECATED! -It's expensive and unreliable! It's better practice to launch workload -tasks directly from inside their target cgroup. Use dedicated workload -cgroups to allow fine-grained policy adjustments without having to -move physical pages between control domains. - -Users can move charges associated with a task along with task migration, that -is, uncharge task's pages from the old cgroup and charge them to the new cgroup. -This feature is not supported in !CONFIG_MMU environments because of lack of -page tables. - -8.1 Interface -------------- - -This feature is disabled by default. It can be enabled (and disabled again) by -writing to memory.move_charge_at_immigrate of the destination cgroup. - -If you want to enable it:: - - # echo (some positive value) > memory.move_charge_at_immigrate - -.. note:: - Each bits of move_charge_at_immigrate has its own meaning about what type - of charges should be moved. See :ref:`section 8.2 - ` for details. - -.. note:: - Charges are moved only when you move mm->owner, in other words, - a leader of a thread group. - -.. note:: - If we cannot find enough space for the task in the destination cgroup, we - try to make space by reclaiming memory. Task migration may fail if we - cannot make enough space. - -.. note:: - It can take several seconds if you move charges much. - -And if you want disable it again:: - - # echo 0 > memory.move_charge_at_immigrate - -.. _cgroup-v1-memory-movable-charges: - -8.2 Type of charges which can be moved --------------------------------------- - -Each bit in move_charge_at_immigrate has its own meaning about what type of -charges should be moved. But in any case, it must be noted that an account of -a page or a swap can be moved only when it is charged to the task's current -(old) memory cgroup. 
- -+---+--------------------------------------------------------------------------+ -|bit| what type of charges would be moved ? | -+===+==========================================================================+ -| 0 | A charge of an anonymous page (or swap of it) used by the target task. | -| | You must enable Swap Extension (see 2.4) to enable move of swap charges. | -+---+--------------------------------------------------------------------------+ -| 1 | A charge of file pages (normal file, tmpfs file (e.g. ipc shared memory) | -| | and swaps of tmpfs file) mmapped by the target task. Unlike the case of | -| | anonymous pages, file pages (and swaps) in the range mmapped by the task | -| | will be moved even if the task hasn't done page fault, i.e. they might | -| | not be the task's "RSS", but other task's "RSS" that maps the same file. | -| | The mapcount of the page is ignored (the page can be moved independent | -| | of the mapcount). You must enable Swap Extension (see 2.4) to | -| | enable move of swap charges. | -+---+--------------------------------------------------------------------------+ - -8.3 TODO --------- - -- All of moving charge operations are done under cgroup_mutex. It's not good - behavior to hold the mutex too long, so we may need some trick. +Reading memory.move_charge_at_immigrate will always return 0 and writing +to it will always return -EINVAL. 9. Memory thresholds ==================== diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index f8744f5630bb9..ef7e2a0ec66a3 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -593,29 +593,19 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, struct cftype *cft) { - return mem_cgroup_from_css(css)->move_charge_at_immigrate; + return 0; } #ifdef CONFIG_MMU static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - pr_warn_once("Cgroup memory moving (move_charge_at_immigrate) is deprecated. " "Please report your usecase to linux-mm@kvack.org if you " "depend on this functionality.\n"); - if (val & ~MOVE_MASK) + if (val != 0) return -EINVAL; - - /* - * No kind of locking is needed in here, because ->can_attach() will - * check this value once in the beginning of the process, and then carry - * on with stale data. This means that changes to this value will only - * affect task migrations starting after the change. - */ - memcg->move_charge_at_immigrate = val; return 0; } #else From 6b611388b626eaa59d202bf8f64d095ff80bcde6 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 24 Oct 2024 18:22:59 -0700 Subject: [PATCH 115/215] memcg-v1: remove charge move code The memcg-v1 charge move feature has been deprecated completely and let's remove the relevant code as well. 
Link: https://lkml.kernel.org/r/20241025012304.2473312-3-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Muchun Song Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 5 - mm/memcontrol-v1.c | 887 ------------------------------------- mm/memcontrol-v1.h | 6 - mm/memcontrol.c | 9 - 4 files changed, 907 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 34d2da05f2f15..0b113267b2de0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -299,11 +299,6 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - /* - * Should we move charges of a task when a task is moved into this - * mem_cgroup ? And what type of charges should we move ? - */ - unsigned long move_charge_at_immigrate; /* taken only while moving_account > 0 */ spinlock_t move_lock; unsigned long move_lock_flags; diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index ef7e2a0ec66a3..9c0fba8c8a833 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -40,31 +40,6 @@ static struct mem_cgroup_tree soft_limit_tree __read_mostly; #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 -/* Stuffs for move charges at task migration. */ -/* - * Types of charges to be moved. - */ -#define MOVE_ANON 0x1ULL -#define MOVE_FILE 0x2ULL -#define MOVE_MASK (MOVE_ANON | MOVE_FILE) - -/* "mc" and its members are protected by cgroup_mutex */ -static struct move_charge_struct { - spinlock_t lock; /* for from, to */ - struct mm_struct *mm; - struct mem_cgroup *from; - struct mem_cgroup *to; - unsigned long flags; - unsigned long precharge; - unsigned long moved_charge; - unsigned long moved_swap; - struct task_struct *moving_task; /* a task moving charges */ - wait_queue_head_t waitq; /* a waitq for other context */ -} mc = { - .lock = __SPIN_LOCK_UNLOCKED(mc.lock), - .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), -}; - /* for OOM */ struct mem_cgroup_eventfd_list { struct list_head list; @@ -426,51 +401,6 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, return nr_reclaimed; } -/* - * A routine for checking "mem" is under move_account() or not. - * - * Checking a cgroup is mc.from or mc.to or under hierarchy of - * moving cgroups. This is for waiting at high-memory pressure - * caused by "move". - */ -static bool mem_cgroup_under_move(struct mem_cgroup *memcg) -{ - struct mem_cgroup *from; - struct mem_cgroup *to; - bool ret = false; - /* - * Unlike task_move routines, we access mc.to, mc.from not under - * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. - */ - spin_lock(&mc.lock); - from = mc.from; - to = mc.to; - if (!from) - goto unlock; - - ret = mem_cgroup_is_descendant(from, memcg) || - mem_cgroup_is_descendant(to, memcg); -unlock: - spin_unlock(&mc.lock); - return ret; -} - -bool memcg1_wait_acct_move(struct mem_cgroup *memcg) -{ - if (mc.moving_task && current != mc.moving_task) { - if (mem_cgroup_under_move(memcg)) { - DEFINE_WAIT(wait); - prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); - /* moving charge context might have finished. */ - if (mc.moving_task) - schedule(); - finish_wait(&mc.waitq, &wait); - return true; - } - } - return false; -} - /** * folio_memcg_lock - Bind a folio to its memcg. * @folio: The folio. 
@@ -552,44 +482,6 @@ void folio_memcg_unlock(struct folio *folio) __folio_memcg_unlock(folio_memcg(folio)); } -#ifdef CONFIG_SWAP -/** - * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. - * @entry: swap entry to be moved - * @from: mem_cgroup which the entry is moved from - * @to: mem_cgroup which the entry is moved to - * - * It succeeds only when the swap_cgroup's record for this entry is the same - * as the mem_cgroup's id of @from. - * - * Returns 0 on success, -EINVAL on failure. - * - * The caller must have charged to @to, IOW, called page_counter_charge() about - * both res and memsw, and called css_get(). - */ -static int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to) -{ - unsigned short old_id, new_id; - - old_id = mem_cgroup_id(from); - new_id = mem_cgroup_id(to); - - if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mod_memcg_state(from, MEMCG_SWAP, -1); - mod_memcg_state(to, MEMCG_SWAP, 1); - return 0; - } - return -EINVAL; -} -#else -static inline int mem_cgroup_move_swap_account(swp_entry_t entry, - struct mem_cgroup *from, struct mem_cgroup *to) -{ - return -EINVAL; -} -#endif - static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -616,785 +508,6 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, } #endif -#ifdef CONFIG_MMU -/* Handlers for move charge at task migration. */ -static int mem_cgroup_do_precharge(unsigned long count) -{ - int ret; - - /* Try a single bulk charge without reclaim first, kswapd may wake */ - ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count); - if (!ret) { - mc.precharge += count; - return ret; - } - - /* Try charges one by one with reclaim, but do not retry */ - while (count--) { - ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1); - if (ret) - return ret; - mc.precharge++; - cond_resched(); - } - return 0; -} - -union mc_target { - struct folio *folio; - swp_entry_t ent; -}; - -enum mc_target_type { - MC_TARGET_NONE = 0, - MC_TARGET_PAGE, - MC_TARGET_SWAP, - MC_TARGET_DEVICE, -}; - -static struct page *mc_handle_present_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent) -{ - struct page *page = vm_normal_page(vma, addr, ptent); - - if (!page) - return NULL; - if (PageAnon(page)) { - if (!(mc.flags & MOVE_ANON)) - return NULL; - } else { - if (!(mc.flags & MOVE_FILE)) - return NULL; - } - get_page(page); - - return page; -} - -#if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE) -static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, - pte_t ptent, swp_entry_t *entry) -{ - struct page *page = NULL; - swp_entry_t ent = pte_to_swp_entry(ptent); - - if (!(mc.flags & MOVE_ANON)) - return NULL; - - /* - * Handle device private pages that are not accessible by the CPU, but - * stored as special swap entries in the page table. - */ - if (is_device_private_entry(ent)) { - page = pfn_swap_entry_to_page(ent); - if (!get_page_unless_zero(page)) - return NULL; - return page; - } - - if (non_swap_entry(ent)) - return NULL; - - /* - * Because swap_cache_get_folio() updates some statistics counter, - * we call find_get_page() with swapper_space directly. 
- */ - page = find_get_page(swap_address_space(ent), swap_cache_index(ent)); - entry->val = ent.val; - - return page; -} -#else -static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, - pte_t ptent, swp_entry_t *entry) -{ - return NULL; -} -#endif - -static struct page *mc_handle_file_pte(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent) -{ - unsigned long index; - struct folio *folio; - - if (!vma->vm_file) /* anonymous vma */ - return NULL; - if (!(mc.flags & MOVE_FILE)) - return NULL; - - /* folio is moved even if it's not RSS of this task(page-faulted). */ - /* shmem/tmpfs may report page out on swap: account for that too. */ - index = linear_page_index(vma, addr); - folio = filemap_get_incore_folio(vma->vm_file->f_mapping, index); - if (IS_ERR(folio)) - return NULL; - return folio_file_page(folio, index); -} - -static void memcg1_check_events(struct mem_cgroup *memcg, int nid); -static void memcg1_charge_statistics(struct mem_cgroup *memcg, int nr_pages); - -/** - * mem_cgroup_move_account - move account of the folio - * @folio: The folio. - * @compound: charge the page as compound or small page - * @from: mem_cgroup which the folio is moved from. - * @to: mem_cgroup which the folio is moved to. @from != @to. - * - * The folio must be locked and not on the LRU. - * - * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" - * from old cgroup. - */ -static int mem_cgroup_move_account(struct folio *folio, - bool compound, - struct mem_cgroup *from, - struct mem_cgroup *to) -{ - struct lruvec *from_vec, *to_vec; - struct pglist_data *pgdat; - unsigned int nr_pages = compound ? folio_nr_pages(folio) : 1; - int nid, ret; - - VM_BUG_ON(from == to); - VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); - VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); - VM_BUG_ON(compound && !folio_test_large(folio)); - - ret = -EINVAL; - if (folio_memcg(folio) != from) - goto out; - - pgdat = folio_pgdat(folio); - from_vec = mem_cgroup_lruvec(from, pgdat); - to_vec = mem_cgroup_lruvec(to, pgdat); - - folio_memcg_lock(folio); - - if (folio_test_anon(folio)) { - if (folio_mapped(folio)) { - __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages); - __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages); - if (folio_test_pmd_mappable(folio)) { - __mod_lruvec_state(from_vec, NR_ANON_THPS, - -nr_pages); - __mod_lruvec_state(to_vec, NR_ANON_THPS, - nr_pages); - } - } - } else { - __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages); - - if (folio_test_swapbacked(folio)) { - __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages); - __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages); - } - - if (folio_mapped(folio)) { - __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages); - } - - if (folio_test_dirty(folio)) { - struct address_space *mapping = folio_mapping(folio); - - if (mapping_can_writeback(mapping)) { - __mod_lruvec_state(from_vec, NR_FILE_DIRTY, - -nr_pages); - __mod_lruvec_state(to_vec, NR_FILE_DIRTY, - nr_pages); - } - } - } - -#ifdef CONFIG_SWAP - if (folio_test_swapcache(folio)) { - __mod_lruvec_state(from_vec, NR_SWAPCACHE, -nr_pages); - __mod_lruvec_state(to_vec, NR_SWAPCACHE, nr_pages); - } -#endif - if (folio_test_writeback(folio)) { - __mod_lruvec_state(from_vec, NR_WRITEBACK, -nr_pages); - __mod_lruvec_state(to_vec, NR_WRITEBACK, nr_pages); - } - - /* - * All state has been migrated, let's switch to the new memcg. 
- * - * It is safe to change page's memcg here because the page - * is referenced, charged, isolated, and locked: we can't race - * with (un)charging, migration, LRU putback, or anything else - * that would rely on a stable page's memory cgroup. - * - * Note that folio_memcg_lock is a memcg lock, not a page lock, - * to save space. As soon as we switch page's memory cgroup to a - * new memcg that isn't locked, the above state can change - * concurrently again. Make sure we're truly done with it. - */ - smp_mb(); - - css_get(&to->css); - css_put(&from->css); - - /* Warning should never happen, so don't worry about refcount non-0 */ - WARN_ON_ONCE(folio_unqueue_deferred_split(folio)); - folio->memcg_data = (unsigned long)to; - - __folio_memcg_unlock(from); - - ret = 0; - nid = folio_nid(folio); - - local_irq_disable(); - memcg1_charge_statistics(to, nr_pages); - memcg1_check_events(to, nid); - memcg1_charge_statistics(from, -nr_pages); - memcg1_check_events(from, nid); - local_irq_enable(); -out: - return ret; -} - -/** - * get_mctgt_type - get target type of moving charge - * @vma: the vma the pte to be checked belongs - * @addr: the address corresponding to the pte to be checked - * @ptent: the pte to be checked - * @target: the pointer the target page or swap ent will be stored(can be NULL) - * - * Context: Called with pte lock held. - * Return: - * * MC_TARGET_NONE - If the pte is not a target for move charge. - * * MC_TARGET_PAGE - If the page corresponding to this pte is a target for - * move charge. If @target is not NULL, the folio is stored in target->folio - * with extra refcnt taken (Caller should release it). - * * MC_TARGET_SWAP - If the swap entry corresponding to this pte is a - * target for charge migration. If @target is not NULL, the entry is - * stored in target->ent. - * * MC_TARGET_DEVICE - Like MC_TARGET_PAGE but page is device memory and - * thus not on the lru. For now such page is charged like a regular page - * would be as it is just special memory taking the place of a regular page. - * See Documentations/vm/hmm.txt and include/linux/hmm.h - */ -static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, - unsigned long addr, pte_t ptent, union mc_target *target) -{ - struct page *page = NULL; - struct folio *folio; - enum mc_target_type ret = MC_TARGET_NONE; - swp_entry_t ent = { .val = 0 }; - - if (pte_present(ptent)) - page = mc_handle_present_pte(vma, addr, ptent); - else if (pte_none_mostly(ptent)) - /* - * PTE markers should be treated as a none pte here, separated - * from other swap handling below. - */ - page = mc_handle_file_pte(vma, addr, ptent); - else if (is_swap_pte(ptent)) - page = mc_handle_swap_pte(vma, ptent, &ent); - - if (page) - folio = page_folio(page); - if (target && page) { - if (!folio_trylock(folio)) { - folio_put(folio); - return ret; - } - /* - * page_mapped() must be stable during the move. This - * pte is locked, so if it's present, the page cannot - * become unmapped. If it isn't, we have only partial - * control over the mapped state: the page lock will - * prevent new faults against pagecache and swapcache, - * so an unmapped page cannot become mapped. However, - * if the page is already mapped elsewhere, it can - * unmap, and there is nothing we can do about it. - * Alas, skip moving the page in this case. 
- */ - if (!pte_present(ptent) && page_mapped(page)) { - folio_unlock(folio); - folio_put(folio); - return ret; - } - } - - if (!page && !ent.val) - return ret; - if (page) { - /* - * Do only loose check w/o serialization. - * mem_cgroup_move_account() checks the page is valid or - * not under LRU exclusion. - */ - if (folio_memcg(folio) == mc.from) { - ret = MC_TARGET_PAGE; - if (folio_is_device_private(folio) || - folio_is_device_coherent(folio)) - ret = MC_TARGET_DEVICE; - if (target) - target->folio = folio; - } - if (!ret || !target) { - if (target) - folio_unlock(folio); - folio_put(folio); - } - } - /* - * There is a swap entry and a page doesn't exist or isn't charged. - * But we cannot move a tail-page in a THP. - */ - if (ent.val && !ret && (!page || !PageTransCompound(page)) && - mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { - ret = MC_TARGET_SWAP; - if (target) - target->ent = ent; - } - return ret; -} - -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -/* - * We don't consider PMD mapped swapping or file mapped pages because THP does - * not support them for now. - * Caller should make sure that pmd_trans_huge(pmd) is true. - */ -static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, union mc_target *target) -{ - struct page *page = NULL; - struct folio *folio; - enum mc_target_type ret = MC_TARGET_NONE; - - if (unlikely(is_swap_pmd(pmd))) { - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(pmd)); - return ret; - } - page = pmd_page(pmd); - VM_BUG_ON_PAGE(!page || !PageHead(page), page); - folio = page_folio(page); - if (!(mc.flags & MOVE_ANON)) - return ret; - if (folio_memcg(folio) == mc.from) { - ret = MC_TARGET_PAGE; - if (target) { - folio_get(folio); - if (!folio_trylock(folio)) { - folio_put(folio); - return MC_TARGET_NONE; - } - target->folio = folio; - } - } - return ret; -} -#else -static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma, - unsigned long addr, pmd_t pmd, union mc_target *target) -{ - return MC_TARGET_NONE; -} -#endif - -static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, - unsigned long addr, unsigned long end, - struct mm_walk *walk) -{ - struct vm_area_struct *vma = walk->vma; - pte_t *pte; - spinlock_t *ptl; - - ptl = pmd_trans_huge_lock(pmd, vma); - if (ptl) { - /* - * Note their can not be MC_TARGET_DEVICE for now as we do not - * support transparent huge page with MEMORY_DEVICE_PRIVATE but - * this might change. 
- */ - if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) - mc.precharge += HPAGE_PMD_NR; - spin_unlock(ptl); - return 0; - } - - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (!pte) - return 0; - for (; addr != end; pte++, addr += PAGE_SIZE) - if (get_mctgt_type(vma, addr, ptep_get(pte), NULL)) - mc.precharge++; /* increment precharge temporarily */ - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); - - return 0; -} - -static const struct mm_walk_ops precharge_walk_ops = { - .pmd_entry = mem_cgroup_count_precharge_pte_range, - .walk_lock = PGWALK_RDLOCK, -}; - -static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) -{ - unsigned long precharge; - - mmap_read_lock(mm); - walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); - mmap_read_unlock(mm); - - precharge = mc.precharge; - mc.precharge = 0; - - return precharge; -} - -static int mem_cgroup_precharge_mc(struct mm_struct *mm) -{ - unsigned long precharge = mem_cgroup_count_precharge(mm); - - VM_BUG_ON(mc.moving_task); - mc.moving_task = current; - return mem_cgroup_do_precharge(precharge); -} - -/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ -static void __mem_cgroup_clear_mc(void) -{ - struct mem_cgroup *from = mc.from; - struct mem_cgroup *to = mc.to; - - /* we must uncharge all the leftover precharges from mc.to */ - if (mc.precharge) { - mem_cgroup_cancel_charge(mc.to, mc.precharge); - mc.precharge = 0; - } - /* - * we didn't uncharge from mc.from at mem_cgroup_move_account(), so - * we must uncharge here. - */ - if (mc.moved_charge) { - mem_cgroup_cancel_charge(mc.from, mc.moved_charge); - mc.moved_charge = 0; - } - /* we must fixup refcnts and charges */ - if (mc.moved_swap) { - /* uncharge swap account from the old cgroup */ - if (!mem_cgroup_is_root(mc.from)) - page_counter_uncharge(&mc.from->memsw, mc.moved_swap); - - mem_cgroup_id_put_many(mc.from, mc.moved_swap); - - /* - * we charged both to->memory and to->memsw, so we - * should uncharge to->memory. - */ - if (!mem_cgroup_is_root(mc.to)) - page_counter_uncharge(&mc.to->memory, mc.moved_swap); - - mc.moved_swap = 0; - } - memcg1_oom_recover(from); - memcg1_oom_recover(to); - wake_up_all(&mc.waitq); -} - -static void mem_cgroup_clear_mc(void) -{ - struct mm_struct *mm = mc.mm; - - /* - * we must clear moving_task before waking up waiters at the end of - * task migration. - */ - mc.moving_task = NULL; - __mem_cgroup_clear_mc(); - spin_lock(&mc.lock); - mc.from = NULL; - mc.to = NULL; - mc.mm = NULL; - spin_unlock(&mc.lock); - - mmput(mm); -} - -int memcg1_can_attach(struct cgroup_taskset *tset) -{ - struct cgroup_subsys_state *css; - struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */ - struct mem_cgroup *from; - struct task_struct *leader, *p; - struct mm_struct *mm; - unsigned long move_flags; - int ret = 0; - - /* charge immigration isn't supported on the default hierarchy */ - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - return 0; - - /* - * Multi-process migrations only happen on the default hierarchy - * where charge immigration is not used. Perform charge - * immigration if @tset contains a leader and whine if there are - * multiple. - */ - p = NULL; - cgroup_taskset_for_each_leader(leader, css, tset) { - WARN_ON_ONCE(p); - p = leader; - memcg = mem_cgroup_from_css(css); - } - if (!p) - return 0; - - /* - * We are now committed to this value whatever it is. Changes in this - * tunable will only affect upcoming migrations, not the current one. 
- * So we need to save it, and keep it going. - */ - move_flags = READ_ONCE(memcg->move_charge_at_immigrate); - if (!move_flags) - return 0; - - from = mem_cgroup_from_task(p); - - VM_BUG_ON(from == memcg); - - mm = get_task_mm(p); - if (!mm) - return 0; - /* We move charges only when we move a owner of the mm */ - if (mm->owner == p) { - VM_BUG_ON(mc.from); - VM_BUG_ON(mc.to); - VM_BUG_ON(mc.precharge); - VM_BUG_ON(mc.moved_charge); - VM_BUG_ON(mc.moved_swap); - - spin_lock(&mc.lock); - mc.mm = mm; - mc.from = from; - mc.to = memcg; - mc.flags = move_flags; - spin_unlock(&mc.lock); - /* We set mc.moving_task later */ - - ret = mem_cgroup_precharge_mc(mm); - if (ret) - mem_cgroup_clear_mc(); - } else { - mmput(mm); - } - return ret; -} - -void memcg1_cancel_attach(struct cgroup_taskset *tset) -{ - if (mc.to) - mem_cgroup_clear_mc(); -} - -static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, - unsigned long addr, unsigned long end, - struct mm_walk *walk) -{ - int ret = 0; - struct vm_area_struct *vma = walk->vma; - pte_t *pte; - spinlock_t *ptl; - enum mc_target_type target_type; - union mc_target target; - struct folio *folio; - bool tried_split_before = false; - -retry_pmd: - ptl = pmd_trans_huge_lock(pmd, vma); - if (ptl) { - if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(ptl); - return 0; - } - target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); - if (target_type == MC_TARGET_PAGE) { - folio = target.folio; - /* - * Deferred split queue locking depends on memcg, - * and unqueue is unsafe unless folio refcount is 0: - * split or skip if on the queue? first try to split. - */ - if (!list_empty(&folio->_deferred_list)) { - spin_unlock(ptl); - if (!tried_split_before) - split_folio(folio); - folio_unlock(folio); - folio_put(folio); - if (tried_split_before) - return 0; - tried_split_before = true; - goto retry_pmd; - } - /* - * So long as that pmd lock is held, the folio cannot - * be racily added to the _deferred_list, because - * __folio_remove_rmap() will find !partially_mapped. - */ - if (folio_isolate_lru(folio)) { - if (!mem_cgroup_move_account(folio, true, - mc.from, mc.to)) { - mc.precharge -= HPAGE_PMD_NR; - mc.moved_charge += HPAGE_PMD_NR; - } - folio_putback_lru(folio); - } - folio_unlock(folio); - folio_put(folio); - } else if (target_type == MC_TARGET_DEVICE) { - folio = target.folio; - if (!mem_cgroup_move_account(folio, true, - mc.from, mc.to)) { - mc.precharge -= HPAGE_PMD_NR; - mc.moved_charge += HPAGE_PMD_NR; - } - folio_unlock(folio); - folio_put(folio); - } - spin_unlock(ptl); - return 0; - } - -retry: - pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (!pte) - return 0; - for (; addr != end; addr += PAGE_SIZE) { - pte_t ptent = ptep_get(pte++); - bool device = false; - swp_entry_t ent; - - if (!mc.precharge) - break; - - switch (get_mctgt_type(vma, addr, ptent, &target)) { - case MC_TARGET_DEVICE: - device = true; - fallthrough; - case MC_TARGET_PAGE: - folio = target.folio; - /* - * We can have a part of the split pmd here. Moving it - * can be done but it would be too convoluted so simply - * ignore such a partial THP and keep it in original - * memcg. There should be somebody mapping the head. - */ - if (folio_test_large(folio)) - goto put; - if (!device && !folio_isolate_lru(folio)) - goto put; - if (!mem_cgroup_move_account(folio, false, - mc.from, mc.to)) { - mc.precharge--; - /* we uncharge from mc.from later. 
*/ - mc.moved_charge++; - } - if (!device) - folio_putback_lru(folio); -put: /* get_mctgt_type() gets & locks the page */ - folio_unlock(folio); - folio_put(folio); - break; - case MC_TARGET_SWAP: - ent = target.ent; - if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) { - mc.precharge--; - mem_cgroup_id_get_many(mc.to, 1); - /* we fixup other refcnts and charges later. */ - mc.moved_swap++; - } - break; - default: - break; - } - } - pte_unmap_unlock(pte - 1, ptl); - cond_resched(); - - if (addr != end) { - /* - * We have consumed all precharges we got in can_attach(). - * We try charge one by one, but don't do any additional - * charges to mc.to if we have failed in charge once in attach() - * phase. - */ - ret = mem_cgroup_do_precharge(1); - if (!ret) - goto retry; - } - - return ret; -} - -static const struct mm_walk_ops charge_walk_ops = { - .pmd_entry = mem_cgroup_move_charge_pte_range, - .walk_lock = PGWALK_RDLOCK, -}; - -static void mem_cgroup_move_charge(void) -{ - lru_add_drain_all(); - /* - * Signal folio_memcg_lock() to take the memcg's move_lock - * while we're moving its pages to another memcg. Then wait - * for already started RCU-only updates to finish. - */ - atomic_inc(&mc.from->moving_account); - synchronize_rcu(); -retry: - if (unlikely(!mmap_read_trylock(mc.mm))) { - /* - * Someone who are holding the mmap_lock might be waiting in - * waitq. So we cancel all extra charges, wake up all waiters, - * and retry. Because we cancel precharges, we might not be able - * to move enough charges, but moving charge is a best-effort - * feature anyway, so it wouldn't be a big problem. - */ - __mem_cgroup_clear_mc(); - cond_resched(); - goto retry; - } - /* - * When we have consumed all precharges and failed in doing - * additional charge, the page walk just aborts. 
- */ - walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); - mmap_read_unlock(mc.mm); - atomic_dec(&mc.from->moving_account); -} - -void memcg1_move_task(void) -{ - if (mc.to) { - mem_cgroup_move_charge(); - mem_cgroup_clear_mc(); - } -} - -#else /* !CONFIG_MMU */ -int memcg1_can_attach(struct cgroup_taskset *tset) -{ - return 0; -} -void memcg1_cancel_attach(struct cgroup_taskset *tset) -{ -} -void memcg1_move_task(void) -{ -} -#endif - static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) { struct mem_cgroup_threshold_ary *t; diff --git a/mm/memcontrol-v1.h b/mm/memcontrol-v1.h index c0672e25bcdb2..0e3b82951d915 100644 --- a/mm/memcontrol-v1.h +++ b/mm/memcontrol-v1.h @@ -80,12 +80,7 @@ static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) WRITE_ONCE(memcg->soft_limit, PAGE_COUNTER_MAX); } -bool memcg1_wait_acct_move(struct mem_cgroup *memcg); - struct cgroup_taskset; -int memcg1_can_attach(struct cgroup_taskset *tset); -void memcg1_cancel_attach(struct cgroup_taskset *tset); -void memcg1_move_task(void); void memcg1_css_offline(struct mem_cgroup *memcg); /* for encoding cft->private value on file */ @@ -130,7 +125,6 @@ static inline void memcg1_free_events(struct mem_cgroup *memcg) {} static inline void memcg1_memcg_init(struct mem_cgroup *memcg) {} static inline void memcg1_remove_from_trees(struct mem_cgroup *memcg) {} static inline void memcg1_soft_limit_reset(struct mem_cgroup *memcg) {} -static inline bool memcg1_wait_acct_move(struct mem_cgroup *memcg) { return false; } static inline void memcg1_css_offline(struct mem_cgroup *memcg) {} static inline bool memcg1_oom_prepare(struct mem_cgroup *memcg, bool *locked) { return true; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5fcdd25fc1342..ec4ac6eb650bc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2242,12 +2242,6 @@ int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, */ if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER)) goto retry; - /* - * At task move, charge accounts can be doubly counted. So, it's - * better to wait until the end of task_move if something is going on. - */ - if (memcg1_wait_acct_move(mem_over_limit)) - goto retry; if (nr_retries--) goto retry; @@ -4441,9 +4435,6 @@ struct cgroup_subsys memory_cgrp_subsys = { .exit = mem_cgroup_exit, .dfl_cftypes = memory_files, #ifdef CONFIG_MEMCG_V1 - .can_attach = memcg1_can_attach, - .cancel_attach = memcg1_cancel_attach, - .post_attach = memcg1_move_task, .legacy_cftypes = mem_cgroup_legacy_files, #endif .early_init = 0, From a8cd9d4ce35eaeb603c3ae7633bb120de5970b3c Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 24 Oct 2024 18:23:00 -0700 Subject: [PATCH 116/215] memcg-v1: no need for memcg locking for dirty tracking During the era of memcg charge migration, the kernel has to be make sure that the dirty stat updates do not race with the charge migration. Otherwise it might update the dirty stats of the wrong memcg. Now with the memcg charge migration gone, there is no more race for dirty stat updates and the previous locking can be removed. 
Link: https://lkml.kernel.org/r/20241025012304.2473312-4-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Muchun Song Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- fs/buffer.c | 5 ----- mm/page-writeback.c | 16 +++------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index 1fc9a50def0b5..88e765b0699fe 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -736,15 +736,12 @@ bool block_dirty_folio(struct address_space *mapping, struct folio *folio) * Lock out page's memcg migration to keep PageDirty * synchronized with per-memcg dirty page counters. */ - folio_memcg_lock(folio); newly_dirty = !folio_test_set_dirty(folio); spin_unlock(&mapping->i_private_lock); if (newly_dirty) __folio_mark_dirty(folio, mapping, 1); - folio_memcg_unlock(folio); - if (newly_dirty) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); @@ -1194,13 +1191,11 @@ void mark_buffer_dirty(struct buffer_head *bh) struct folio *folio = bh->b_folio; struct address_space *mapping = NULL; - folio_memcg_lock(folio); if (!folio_test_set_dirty(folio)) { mapping = folio->mapping; if (mapping) __folio_mark_dirty(folio, mapping, 0); } - folio_memcg_unlock(folio); if (mapping) __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1d7179aba8e3e..a76a73529fd9c 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2743,8 +2743,6 @@ EXPORT_SYMBOL(noop_dirty_folio); /* * Helper function for set_page_dirty family. * - * Caller must hold folio_memcg_lock(). - * * NOTE: This relies on being atomic wrt interrupts. */ static void folio_account_dirtied(struct folio *folio, @@ -2777,7 +2775,6 @@ static void folio_account_dirtied(struct folio *folio, /* * Helper function for deaccounting dirty page without writeback. * - * Caller must hold folio_memcg_lock(). */ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) { @@ -2795,9 +2792,8 @@ void folio_account_cleaned(struct folio *folio, struct bdi_writeback *wb) * If warn is true, then emit a warning if the folio is not uptodate and has * not been truncated. * - * The caller must hold folio_memcg_lock(). It is the caller's - * responsibility to prevent the folio from being truncated while - * this function is in progress, although it may have been truncated + * It is the caller's responsibility to prevent the folio from being truncated + * while this function is in progress, although it may have been truncated * before this function is called. Most callers have the folio locked. * A few have the folio blocked from truncation through other means (e.g. * zap_vma_pages() has it mapped and is holding the page table lock). 
@@ -2841,14 +2837,10 @@ void __folio_mark_dirty(struct folio *folio, struct address_space *mapping, */ bool filemap_dirty_folio(struct address_space *mapping, struct folio *folio) { - folio_memcg_lock(folio); - if (folio_test_set_dirty(folio)) { - folio_memcg_unlock(folio); + if (folio_test_set_dirty(folio)) return false; - } __folio_mark_dirty(folio, mapping, !folio_test_private(folio)); - folio_memcg_unlock(folio); if (mapping->host) { /* !PageAnon && !swapper_space */ @@ -2975,14 +2967,12 @@ void __folio_cancel_dirty(struct folio *folio) struct bdi_writeback *wb; struct wb_lock_cookie cookie = {}; - folio_memcg_lock(folio); wb = unlocked_inode_to_wb_begin(inode, &cookie); if (folio_test_clear_dirty(folio)) folio_account_cleaned(folio, wb); unlocked_inode_to_wb_end(inode, &cookie); - folio_memcg_unlock(folio); } else { folio_clear_dirty(folio); } From 568bcf4148493a3cf544f88df4e81e862b69f5e9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 24 Oct 2024 18:23:01 -0700 Subject: [PATCH 117/215] memcg-v1: no need for memcg locking for writeback tracking During the era of memcg charge migration, the kernel had to make sure that the writeback stat updates did not race with the charge migration. Otherwise it might have updated the writeback stats of the wrong memcg. Now with the memcg charge migration gone, there is no more race for writeback stat updates and the previous locking can be removed. Link: https://lkml.kernel.org/r/20241025012304.2473312-5-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Muchun Song Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/page-writeback.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a76a73529fd9c..9c3317c3a6155 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -3083,7 +3083,6 @@ bool __folio_end_writeback(struct folio *folio) struct address_space *mapping = folio_mapping(folio); bool ret; - folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { struct inode *inode = mapping->host; struct backing_dev_info *bdi = inode_to_bdi(inode); @@ -3114,7 +3113,6 @@ bool __folio_end_writeback(struct folio *folio) lruvec_stat_mod_folio(folio, NR_WRITEBACK, -nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, -nr); node_stat_mod_folio(folio, NR_WRITTEN, nr); - folio_memcg_unlock(folio); return ret; } @@ -3127,7 +3125,6 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) VM_BUG_ON_FOLIO(folio_test_writeback(folio), folio); - folio_memcg_lock(folio); if (mapping && mapping_use_writeback_tags(mapping)) { XA_STATE(xas, &mapping->i_pages, folio_index(folio)); struct inode *inode = mapping->host; @@ -3168,7 +3165,6 @@ void __folio_start_writeback(struct folio *folio, bool keep_write) lruvec_stat_mod_folio(folio, NR_WRITEBACK, nr); zone_stat_mod_folio(folio, NR_ZONE_WRITE_PENDING, nr); - folio_memcg_unlock(folio); access_ret = arch_make_folio_accessible(folio); /* From cf4a65539c136d78d1b3b20e94caeecb616ea9d9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 24 Oct 2024 18:23:02 -0700 Subject: [PATCH 118/215] memcg-v1: no need for memcg locking for MGLRU While updating the generation of the folios, MGLRU requires that the folio's memcg association remains stable. With the charge migration deprecated, there is no need for MGLRU to acquire locks to keep the folio and memcg association stable. 
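For context, here is a minimal sketch of the pattern this patch deletes from the MGLRU walkers (see the mm/vmscan.c hunks in the patch below); the wrapper name and shape are hypothetical, but the bracket it shows is the one that becomes unnecessary, since it existed purely to pin the folio-to-memcg association:

/* Illustrative only: the locking that is no longer needed. */
static void update_gen_bracketed(struct mem_cgroup *memcg)
{
	/* folio_update_gen() requires stable folio_memcg() */
	if (!mem_cgroup_trylock_pages(memcg))
		return;

	/* ... folio_update_gen() calls happened here ... */

	mem_cgroup_unlock_pages();
}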
[yuzhao@google.com: remove !rcu_read_lock_held() assertion] Link: https://lkml.kernel.org/r/ZykEtcHrQRq-KrBC@google.com Link: https://syzkaller.appspot.com/bug?extid=24f45b8beab9788e467e Link: https://lore.kernel.org/lkml/67294349.050a0220.701a.0010.GAE@google.com/ [akpm@linux-foundation.org: remove now-unused local] [shakeel.butt@linux.dev: folio_rcu() fixup, per Yu Zhao] Link: https://lkml.kernel.org/r/iwmabnye3nl4merealrawt3bdvfii2pwavwrddrqpraoveet7h@ezrsdhjwwej7 Link: https://lkml.kernel.org/r/20241025012304.2473312-6-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Reviewed-by: Roman Gushchin Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/vmscan.c | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 8d1301c0f22a1..caba8e811ec55 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3137,7 +3137,6 @@ static int folio_update_gen(struct folio *folio, int gen) unsigned long new_flags, old_flags = READ_ONCE(folio->flags); VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); - VM_WARN_ON_ONCE(!rcu_read_lock_held()); do { /* lru_gen_del_folio() has isolated this page? */ @@ -3353,7 +3352,7 @@ static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, if (folio_nid(folio) != pgdat->node_id) return NULL; - if (folio_memcg_rcu(folio) != memcg) + if (folio_memcg(folio) != memcg) return NULL; /* file VMAs can contain anon pages from COW */ @@ -3649,10 +3648,8 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) .p4d_entry = walk_pud_range, .walk_lock = PGWALK_RDLOCK, }; - int err; struct lruvec *lruvec = walk->lruvec; - struct mem_cgroup *memcg = lruvec_memcg(lruvec); walk->next_addr = FIRST_USER_ADDRESS; @@ -3665,10 +3662,6 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) if (walk->seq != max_seq) break; - /* folio_update_gen() requires stable folio_memcg() */ - if (!mem_cgroup_trylock_pages(memcg)) - break; - /* the caller might be holding the lock for write */ if (mmap_read_trylock(mm)) { err = walk_page_range(mm, walk->next_addr, ULONG_MAX, &mm_walk_ops, walk); @@ -3676,8 +3669,6 @@ static void walk_mm(struct mm_struct *mm, struct lru_gen_mm_walk *walk) mmap_read_unlock(mm); } - mem_cgroup_unlock_pages(); - if (walk->batched) { spin_lock_irq(&lruvec->lru_lock); reset_batch_size(walk); @@ -4099,10 +4090,6 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } } - /* folio_update_gen() requires stable folio_memcg() */ - if (!mem_cgroup_trylock_pages(memcg)) - return true; - arch_enter_lazy_mmu_mode(); pte -= (addr - start) / PAGE_SIZE; @@ -4147,7 +4134,6 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) } arch_leave_lazy_mmu_mode(); - mem_cgroup_unlock_pages(); /* feedback from rmap walkers to page table walkers */ if (mm_state && suitable_to_scan(i, young)) From a29c0e4b2e867f4e362a6740c430bfdc2efdd1d9 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 24 Oct 2024 18:23:03 -0700 Subject: [PATCH 119/215] memcg-v1: remove memcg move locking code The memcg v1 charge move feature has been deprecated. All the places that used the memcg move lock have stopped using it, as they no longer need the protection. Let's proceed to remove all the locking code related to charge moving. 
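To make the end state concrete, here is a hypothetical helper (illustrative only, not part of the patch) showing what pinning a folio's memcg now takes: any one of the stabilisers that remain listed in the memcontrol.h comment below (folio lock, LRU isolation, exclusive reference) suffices, with no folio_memcg_lock() call:

static struct mem_cgroup *memcg_of_locked_folio(struct folio *folio)
{
	/* The folio lock alone keeps folio_memcg() stable now. */
	VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
	return folio_memcg(folio);
}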
Link: https://lkml.kernel.org/r/20241025012304.2473312-7-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Hugh Dickins Cc: Muchun Song Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 54 ------------------------- mm/filemap.c | 1 - mm/memcontrol-v1.c | 82 -------------------------------------- mm/memcontrol.c | 5 --- mm/rmap.c | 1 - 5 files changed, 143 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0b113267b2de0..bb49e0d4b377a 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -299,20 +299,10 @@ struct mem_cgroup { /* For oom notifier event fd */ struct list_head oom_notify; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; - unsigned long move_lock_flags; - /* Legacy tcp memory accounting */ bool tcpmem_active; int tcpmem_pressure; - /* - * set > 0 if pages under this cgroup are moving to other cgroup. - */ - atomic_t moving_account; - struct task_struct *move_lock_task; - /* List of events which userspace want to receive */ struct list_head event_list; spinlock_t event_list_lock; @@ -428,9 +418,7 @@ static inline struct obj_cgroup *__folio_objcg(struct folio *folio) * * - the folio lock * - LRU isolation - * - folio_memcg_lock() * - exclusive reference - * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. @@ -499,9 +487,7 @@ static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) * * - the folio lock * - LRU isolation - * - lock_folio_memcg() * - exclusive reference - * - mem_cgroup_trylock_pages() * * For a kmem folio a caller should hold an rcu read lock to protect memcg * associated with a kmem folio from being released. 
@@ -1867,26 +1853,6 @@ static inline bool task_in_memcg_oom(struct task_struct *p) return p->memcg_in_oom; } -void folio_memcg_lock(struct folio *folio); -void folio_memcg_unlock(struct folio *folio); - -/* try to stablize folio_memcg() for all the pages in a memcg */ -static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -{ - rcu_read_lock(); - - if (mem_cgroup_disabled() || !atomic_read(&memcg->moving_account)) - return true; - - rcu_read_unlock(); - return false; -} - -static inline void mem_cgroup_unlock_pages(void) -{ - rcu_read_unlock(); -} - static inline void mem_cgroup_enter_user_fault(void) { WARN_ON(current->in_user_fault); @@ -1908,26 +1874,6 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, return 0; } -static inline void folio_memcg_lock(struct folio *folio) -{ -} - -static inline void folio_memcg_unlock(struct folio *folio) -{ -} - -static inline bool mem_cgroup_trylock_pages(struct mem_cgroup *memcg) -{ - /* to match folio_memcg_rcu() */ - rcu_read_lock(); - return true; -} - -static inline void mem_cgroup_unlock_pages(void) -{ - rcu_read_unlock(); -} - static inline bool task_in_memcg_oom(struct task_struct *p) { return false; diff --git a/mm/filemap.c b/mm/filemap.c index 630a1c431ea15..e582a1545d2ae 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -119,7 +119,6 @@ * ->i_pages lock (folio_remove_rmap_pte->set_page_dirty) * bdi.wb->list_lock (folio_remove_rmap_pte->set_page_dirty) * ->inode->i_lock (folio_remove_rmap_pte->set_page_dirty) - * ->memcg->move_lock (folio_remove_rmap_pte->folio_memcg_lock) * bdi.wb->list_lock (zap_pte_range->set_page_dirty) * ->inode->i_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->block_dirty_folio) diff --git a/mm/memcontrol-v1.c b/mm/memcontrol-v1.c index 9c0fba8c8a833..539ceefa9d2d0 100644 --- a/mm/memcontrol-v1.c +++ b/mm/memcontrol-v1.c @@ -401,87 +401,6 @@ unsigned long memcg1_soft_limit_reclaim(pg_data_t *pgdat, int order, return nr_reclaimed; } -/** - * folio_memcg_lock - Bind a folio to its memcg. - * @folio: The folio. - * - * This function prevents unlocked LRU folios from being moved to - * another cgroup. - * - * It ensures lifetime of the bound memcg. The caller is responsible - * for the lifetime of the folio. - */ -void folio_memcg_lock(struct folio *folio) -{ - struct mem_cgroup *memcg; - unsigned long flags; - - /* - * The RCU lock is held throughout the transaction. The fast - * path can get away without acquiring the memcg->move_lock - * because page moving starts with an RCU grace period. - */ - rcu_read_lock(); - - if (mem_cgroup_disabled()) - return; -again: - memcg = folio_memcg(folio); - if (unlikely(!memcg)) - return; - -#ifdef CONFIG_PROVE_LOCKING - local_irq_save(flags); - might_lock(&memcg->move_lock); - local_irq_restore(flags); -#endif - - if (atomic_read(&memcg->moving_account) <= 0) - return; - - spin_lock_irqsave(&memcg->move_lock, flags); - if (memcg != folio_memcg(folio)) { - spin_unlock_irqrestore(&memcg->move_lock, flags); - goto again; - } - - /* - * When charge migration first begins, we can have multiple - * critical sections holding the fast-path RCU lock and one - * holding the slowpath move_lock. Track the task who has the - * move_lock for folio_memcg_unlock(). 
- */ - memcg->move_lock_task = current; - memcg->move_lock_flags = flags; -} - -static void __folio_memcg_unlock(struct mem_cgroup *memcg) -{ - if (memcg && memcg->move_lock_task == current) { - unsigned long flags = memcg->move_lock_flags; - - memcg->move_lock_task = NULL; - memcg->move_lock_flags = 0; - - spin_unlock_irqrestore(&memcg->move_lock, flags); - } - - rcu_read_unlock(); -} - -/** - * folio_memcg_unlock - Release the binding between a folio and its memcg. - * @folio: The folio. - * - * This releases the binding created by folio_memcg_lock(). This does - * not change the accounting of this folio to its memcg, but it does - * permit others to change it. - */ -void folio_memcg_unlock(struct folio *folio) -{ - __folio_memcg_unlock(folio_memcg(folio)); -} - static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -1189,7 +1108,6 @@ void memcg1_memcg_init(struct mem_cgroup *memcg) { INIT_LIST_HEAD(&memcg->oom_notify); mutex_init(&memcg->thresholds_lock); - spin_lock_init(&memcg->move_lock); INIT_LIST_HEAD(&memcg->event_list); spin_lock_init(&memcg->event_list_lock); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ec4ac6eb650bc..39e902c1dd9fd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1189,7 +1189,6 @@ void lruvec_memcg_debug(struct lruvec *lruvec, struct folio *folio) * These functions are safe to use under any of the following conditions: * - folio locked * - folio_test_lru false - * - folio_memcg_lock() * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held. @@ -1211,7 +1210,6 @@ struct lruvec *folio_lruvec_lock(struct folio *folio) * These functions are safe to use under any of the following conditions: * - folio locked * - folio_test_lru false - * - folio_memcg_lock() * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts @@ -1235,7 +1233,6 @@ struct lruvec *folio_lruvec_lock_irq(struct folio *folio) * These functions are safe to use under any of the following conditions: * - folio locked * - folio_test_lru false - * - folio_memcg_lock() * - folio frozen (refcount of 0) * * Return: The lruvec this folio is on with its lock held and interrupts @@ -2375,9 +2372,7 @@ static void commit_charge(struct folio *folio, struct mem_cgroup *memcg) * * - the page lock * - LRU isolation - * - folio_memcg_lock() * - exclusive reference - * - mem_cgroup_trylock_pages() */ folio->memcg_data = (unsigned long)memcg; } diff --git a/mm/rmap.c b/mm/rmap.c index 73d5998677d40..4d75433330f97 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -32,7 +32,6 @@ * swap_lock (in swap_duplicate, swap_info_get) * mmlist_lock (in mmput, drain_mmlist and others) * mapping->private_lock (in block_dirty_folio) - * folio_lock_memcg move_lock (in block_dirty_folio) * i_pages lock (widely used) * lruvec->lru_lock (in folio_lruvec_lock_irq) * inode->i_lock (in set_page_dirty's __mark_inode_dirty) From c14f8046cd7c353176c53d2721d52a2bd6a648ec Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Oct 2024 13:26:23 +0100 Subject: [PATCH 120/215] tools: testing: add additional vma_internal.h stubs Patch series "fix error handling in mmap_region() and refactor", v3. The mmap_region() function is somewhat terrifying, with spaghetti-like control flow and numerous means by which issues can arise and incomplete state, memory leaks and other unpleasantness can occur. 
This series goes to great lengths to simplify how mmap_region() works and to avoid unwinding errors late on in the process of setting up the VMA for the new mapping, and equally avoids such operations occurring while the VMA is in an inconsistent state. This series builds on the previously submitted hotfix patches (see link to v2 below) which addresses the most critical issues around mmap_region(), and further works to improve mmap_region() complexity, stability, and testability. This series moves the code to mm/vma.c to render it userland testable, refactors and simplifies it into smaller functions that are significantly more readable. It additionally avoids performing an attempt at a second merge mid-way through allocating a new VMA, a dubious proposition at best and one that is highly subject to subtle bugs. Rather than do this, we simply note that we ought to retry the merge and do this as a final step. This patch (of 3): Add some additional vma_internal.h stubs in preparation for __mmap_region() being moved to mm/vma.c. Without these the move would result in the tests no longer compiling. Link: https://lkml.kernel.org/r/cover.1729858176.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/74b27e159e261d2ac1fe66a130edad1d61fdc176.1729858176.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Cc: Jann Horn Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Peter Xu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/vma/vma_internal.h | 115 ++++++++++++++++++++++++++++++- 1 file changed, 114 insertions(+), 1 deletion(-) diff --git a/tools/testing/vma/vma_internal.h b/tools/testing/vma/vma_internal.h index c5b9da034511c..e76ff579e1fdc 100644 --- a/tools/testing/vma/vma_internal.h +++ b/tools/testing/vma/vma_internal.h @@ -44,7 +44,9 @@ #define VM_LOCKED 0x00002000 #define VM_IO 0x00004000 #define VM_DONTEXPAND 0x00040000 +#define VM_LOCKONFAULT 0x00080000 #define VM_ACCOUNT 0x00100000 +#define VM_NORESERVE 0x00200000 #define VM_MIXEDMAP 0x10000000 #define VM_STACK VM_GROWSDOWN #define VM_SHADOW_STACK VM_NONE @@ -53,6 +55,14 @@ #define VM_ACCESS_FLAGS (VM_READ | VM_WRITE | VM_EXEC) #define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_PFNMAP | VM_MIXEDMAP) +/* This mask represents all the VMA flag bits used by mlock */ +#define VM_LOCKED_MASK (VM_LOCKED | VM_LOCKONFAULT) + +#ifdef CONFIG_64BIT +/* VM is sealed, in vm_flags */ +#define VM_SEALED _BITUL(63) +#endif + #define FIRST_USER_ADDRESS 0UL #define USER_PGTABLES_CEILING 0UL @@ -698,8 +708,9 @@ static inline void tlb_finish_mmu(struct mmu_gather *) { } -static inline void get_file(struct file *) +static inline struct file *get_file(struct file *f) { + return f; } static inline int vma_dup_policy(struct vm_area_struct *, struct vm_area_struct *) @@ -920,4 +931,106 @@ static inline bool signal_pending(void *) return false; } +static inline bool is_file_hugepages(struct file *) +{ + return false; +} + +static inline int security_vm_enough_memory_mm(struct mm_struct *, long) +{ + return true; +} + +static inline bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long) +{ + return true; +} + +static inline void vm_flags_init(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma->__vm_flags = flags; +} + +static inline void vm_flags_set(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma->__vm_flags |= flags; +} + +static inline void vm_flags_clear(struct vm_area_struct *vma, + vm_flags_t flags) +{ + vma_start_write(vma); + vma->__vm_flags &= ~flags; +} + +static inline int 
call_mmap(struct file *, struct vm_area_struct *) +{ + return 0; +} + +static inline int shmem_zero_setup(struct vm_area_struct *) +{ + return 0; +} + +static inline void vma_set_anonymous(struct vm_area_struct *vma) +{ + vma->vm_ops = NULL; +} + +static inline void ksm_add_vma(struct vm_area_struct *) +{ +} + +static inline void perf_event_mmap(struct vm_area_struct *) +{ +} + +static inline bool vma_is_dax(struct vm_area_struct *) +{ + return false; +} + +static inline struct vm_area_struct *get_gate_vma(struct mm_struct *) +{ + return NULL; +} + +bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); + +/* Update vma->vm_page_prot to reflect vma->vm_flags. */ +static inline void vma_set_page_prot(struct vm_area_struct *vma) +{ + unsigned long vm_flags = vma->vm_flags; + pgprot_t vm_page_prot; + + /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */ + vm_page_prot = pgprot_modify(vma->vm_page_prot, vm_get_page_prot(vm_flags)); + + if (vma_wants_writenotify(vma, vm_page_prot)) { + vm_flags &= ~VM_SHARED; + /* testing: we inline vm_pgprot_modify() to avoid clash with vma.h. */ + vm_page_prot = pgprot_modify(vm_page_prot, vm_get_page_prot(vm_flags)); + } + /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */ + WRITE_ONCE(vma->vm_page_prot, vm_page_prot); +} + +static inline bool arch_validate_flags(unsigned long) +{ + return true; +} + +static inline void vma_close(struct vm_area_struct *) +{ +} + +static inline int mmap_file(struct file *, struct vm_area_struct *) +{ + return 0; +} + #endif /* __MM_VMA_INTERNAL_H */ From 52956b0d7fb92e3b39513dda91951ca419afc63a Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Oct 2024 13:26:24 +0100 Subject: [PATCH 121/215] mm: isolate mmap internal logic to mm/vma.c In previous commits we effected improvements to the mmap() logic in mmap_region() and its newly introduced internal implementation function __mmap_region(). However as these changes are intended to be backported, we kept the delta as small as is possible and made as few changes as possible to the newly introduced mm/vma.* files. Take the opportunity to move this logic to mm/vma.c which not only isolates it, but also makes it available for later userland testing which can help us catch such logic errors far earlier. Link: https://lkml.kernel.org/r/93fc2c3aa37dd30590b7e4ee067dfd832007bf7e.1729858176.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/mmap.c | 234 --------------------------------- mm/vma.c | 323 +++++++++++++++++++++++++++++++++++++++++++++- mm/vma.h | 97 +------------- mm/vma_internal.h | 5 + 4 files changed, 329 insertions(+), 330 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index fb91b2cb55615..f904b3bba9627 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -577,22 +577,6 @@ SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) } #endif /* __ARCH_WANT_SYS_OLD_MMAP */ -/* - * We account for memory if it's a private writeable mapping, - * not hugepages and VM_NORESERVE wasn't set. - */ -static inline bool accountable_mapping(struct file *file, vm_flags_t vm_flags) -{ - /* - * hugetlb has its own accounting separate from the core VM - * VM_HUGETLB may not be set yet so we cannot check for that flag. 
- */ - if (file && is_file_hugepages(file)) - return false; - - return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; -} - /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used @@ -1362,224 +1346,6 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_vmi_munmap(&vmi, mm, start, len, uf, false); } -static unsigned long __mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = NULL; - pgoff_t pglen = PHYS_PFN(len); - unsigned long charged = 0; - struct vma_munmap_struct vms; - struct ma_state mas_detach; - struct maple_tree mt_detach; - unsigned long end = addr + len; - int error; - VMA_ITERATOR(vmi, mm, addr); - VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff); - - vmg.file = file; - /* Find the first overlapping VMA */ - vma = vma_find(&vmi, end); - init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false); - if (vma) { - mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); - mt_on_stack(mt_detach); - mas_init(&mas_detach, &mt_detach, /* addr = */ 0); - /* Prepare to unmap any existing mapping in the area */ - error = vms_gather_munmap_vmas(&vms, &mas_detach); - if (error) - goto gather_failed; - - vmg.next = vms.next; - vmg.prev = vms.prev; - vma = NULL; - } else { - vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev); - } - - /* Check against address space limit. */ - if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) { - error = -ENOMEM; - goto abort_munmap; - } - - /* - * Private writable mapping: check memory availability - */ - if (accountable_mapping(file, vm_flags)) { - charged = pglen; - charged -= vms.nr_accounted; - if (charged) { - error = security_vm_enough_memory_mm(mm, charged); - if (error) - goto abort_munmap; - } - - vms.nr_accounted = 0; - vm_flags |= VM_ACCOUNT; - vmg.flags = vm_flags; - } - - /* - * clear PTEs while the vma is still in the tree so that rmap - * cannot race with the freeing later in the truncate scenario. - * This is also needed for mmap_file(), which is why vm_ops - * close function is called. - */ - vms_clean_up_area(&vms, &mas_detach); - vma = vma_merge_new_range(&vmg); - if (vma) - goto expanded; - /* - * Determine the object being mapped and call the appropriate - * specific mapper. the address has already been validated, but - * not unmapped, but the maps are removed from the list. - */ - vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; - goto unacct_error; - } - - vma_iter_config(&vmi, addr, end); - vma_set_range(vma, addr, end, pgoff); - vm_flags_init(vma, vm_flags); - vma->vm_page_prot = vm_get_page_prot(vm_flags); - - if (vma_iter_prealloc(&vmi, vma)) { - error = -ENOMEM; - goto free_vma; - } - - if (file) { - vma->vm_file = get_file(file); - error = mmap_file(file, vma); - if (error) - goto unmap_and_free_file_vma; - - /* Drivers cannot alter the address of the VMA. */ - WARN_ON_ONCE(addr != vma->vm_start); - /* - * Drivers should not permit writability when previously it was - * disallowed. - */ - VM_WARN_ON_ONCE(vm_flags != vma->vm_flags && - !(vm_flags & VM_MAYWRITE) && - (vma->vm_flags & VM_MAYWRITE)); - - vma_iter_config(&vmi, addr, end); - /* - * If vm_flags changed after mmap_file(), we should try merge - * vma again as we may succeed this time. 
- */ - if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { - struct vm_area_struct *merge; - - vmg.flags = vma->vm_flags; - /* If this fails, state is reset ready for a reattempt. */ - merge = vma_merge_new_range(&vmg); - - if (merge) { - /* - * ->mmap() can change vma->vm_file and fput - * the original file. So fput the vma->vm_file - * here or we would add an extra fput for file - * and cause general protection fault - * ultimately. - */ - fput(vma->vm_file); - vm_area_free(vma); - vma = merge; - /* Update vm_flags to pick up the change. */ - vm_flags = vma->vm_flags; - goto file_expanded; - } - vma_iter_config(&vmi, addr, end); - } - - vm_flags = vma->vm_flags; - } else if (vm_flags & VM_SHARED) { - error = shmem_zero_setup(vma); - if (error) - goto free_iter_vma; - } else { - vma_set_anonymous(vma); - } - -#ifdef CONFIG_SPARC64 - /* TODO: Fix SPARC ADI! */ - WARN_ON_ONCE(!arch_validate_flags(vm_flags)); -#endif - - /* Lock the VMA since it is modified after insertion into VMA tree */ - vma_start_write(vma); - vma_iter_store(&vmi, vma); - mm->map_count++; - vma_link_file(vma); - - /* - * vma_merge_new_range() calls khugepaged_enter_vma() too, the below - * call covers the non-merge case. - */ - khugepaged_enter_vma(vma, vma->vm_flags); - -file_expanded: - file = vma->vm_file; - ksm_add_vma(vma); -expanded: - perf_event_mmap(vma); - - /* Unmap any existing mapping in the area */ - vms_complete_munmap_vmas(&vms, &mas_detach); - - vm_stat_account(mm, vm_flags, pglen); - if (vm_flags & VM_LOCKED) { - if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm)) - vm_flags_clear(vma, VM_LOCKED_MASK); - else - mm->locked_vm += pglen; - } - - if (file) - uprobe_mmap(vma); - - /* - * New (or expanded) vma always get soft dirty status. - * Otherwise user-space soft-dirty page tracker won't - * be able to distinguish situation when vma area unmapped, - * then new mapped in-place (which must be aimed as - * a completely new data area). - */ - vm_flags_set(vma, VM_SOFTDIRTY); - - vma_set_page_prot(vma); - - return addr; - -unmap_and_free_file_vma: - fput(vma->vm_file); - vma->vm_file = NULL; - - vma_iter_set(&vmi, vma->vm_end); - /* Undo any partial mapping done by a device driver. */ - unmap_region(&vmi.mas, vma, vmg.prev, vmg.next); -free_iter_vma: - vma_iter_free(&vmi); -free_vma: - vm_area_free(vma); -unacct_error: - if (charged) - vm_unacct_memory(charged); - -abort_munmap: - vms_abort_munmap_vmas(&vms, &mas_detach); -gather_failed: - return error; -} - unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) diff --git a/mm/vma.c b/mm/vma.c index bb7cfa2dc2827..0a2965be582dd 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -1103,7 +1103,7 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, vms->clear_ptes = false; } -void vms_clean_up_area(struct vma_munmap_struct *vms, +static void vms_clean_up_area(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; @@ -1126,7 +1126,7 @@ void vms_clean_up_area(struct vma_munmap_struct *vms, * used for the munmap() and may downgrade the lock - if requested. Everything * needed to be done once the vma maple tree is updated. 
*/ -void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, +static void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *vma; @@ -1167,6 +1167,23 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, __mt_destroy(mas_detach->tree); } +/* + * reattach_vmas() - Undo any munmap work and free resources + * @mas_detach: The maple state with the detached maple tree + * + * Reattach any detached vmas and free up the maple tree used to track the vmas. + */ +static void reattach_vmas(struct ma_state *mas_detach) +{ + struct vm_area_struct *vma; + + mas_set(mas_detach, 0); + mas_for_each(mas_detach, vma, ULONG_MAX) + vma_mark_detached(vma, false); + + __mt_destroy(mas_detach->tree); +} + /* * vms_gather_munmap_vmas() - Put all VMAs within a range into a maple tree * for removal at a later date. Handles splitting first and last if necessary @@ -1177,7 +1194,7 @@ void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, * * Return: 0 on success, error otherwise */ -int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, +static int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, struct ma_state *mas_detach) { struct vm_area_struct *next = NULL; @@ -1315,6 +1332,39 @@ int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, return error; } +/* + * init_vma_munmap() - Initializer wrapper for vma_munmap_struct + * @vms: The vma munmap struct + * @vmi: The vma iterator + * @vma: The first vm_area_struct to munmap + * @start: The aligned start address to munmap + * @end: The aligned end address to munmap + * @uf: The userfaultfd list_head + * @unlock: Unlock after the operation. Only unlocked on success + */ +static void init_vma_munmap(struct vma_munmap_struct *vms, + struct vma_iterator *vmi, struct vm_area_struct *vma, + unsigned long start, unsigned long end, struct list_head *uf, + bool unlock) +{ + vms->vmi = vmi; + vms->vma = vma; + if (vma) { + vms->start = start; + vms->end = end; + } else { + vms->start = vms->end = 0; + } + vms->unlock = unlock; + vms->uf = uf; + vms->vma_count = 0; + vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; + vms->exec_vm = vms->stack_vm = vms->data_vm = 0; + vms->unmap_start = FIRST_USER_ADDRESS; + vms->unmap_end = USER_PGTABLES_CEILING; + vms->clear_ptes = false; +} + /* * do_vmi_align_munmap() - munmap the aligned region from @start to @end. * @vmi: The vma iterator @@ -2069,3 +2119,270 @@ void mm_drop_all_locks(struct mm_struct *mm) mutex_unlock(&mm_all_locks_mutex); } + +/* + * We account for memory if it's a private writeable mapping, + * not hugepages and VM_NORESERVE wasn't set. + */ +static bool accountable_mapping(struct file *file, vm_flags_t vm_flags) +{ + /* + * hugetlb has its own accounting separate from the core VM + * VM_HUGETLB may not be set yet so we cannot check for that flag. + */ + if (file && is_file_hugepages(file)) + return false; + + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; +} + +/* + * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() + * operation. + * @vms: The vma unmap structure + * @mas_detach: The maple state with the detached maple tree + * + * Reattach any detached vmas, free up the maple tree used to track the vmas. + * If that's not possible because the ptes are cleared (and vm_ops->closed() may + * have been called), then a NULL is written over the vmas and the vmas are + * removed (munmap() completed). 
+ */ +static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, + struct ma_state *mas_detach) +{ + struct ma_state *mas = &vms->vmi->mas; + + if (!vms->nr_pages) + return; + + if (vms->clear_ptes) + return reattach_vmas(mas_detach); + + /* + * Aborting cannot just call the vm_ops open() because they are often + * not symmetrical and state data has been lost. Resort to the old + * failure method of leaving a gap where the MAP_FIXED mapping failed. + */ + mas_set_range(mas, vms->start, vms->end - 1); + mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); + /* Clean up the insertion of the unfortunate gap */ + vms_complete_munmap_vmas(vms, mas_detach); +} + +unsigned long __mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + pgoff_t pglen = PHYS_PFN(len); + unsigned long charged = 0; + struct vma_munmap_struct vms; + struct ma_state mas_detach; + struct maple_tree mt_detach; + unsigned long end = addr + len; + int error; + VMA_ITERATOR(vmi, mm, addr); + VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff); + + vmg.file = file; + /* Find the first overlapping VMA */ + vma = vma_find(&vmi, end); + init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false); + if (vma) { + mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); + mt_on_stack(mt_detach); + mas_init(&mas_detach, &mt_detach, /* addr = */ 0); + /* Prepare to unmap any existing mapping in the area */ + error = vms_gather_munmap_vmas(&vms, &mas_detach); + if (error) + goto gather_failed; + + vmg.next = vms.next; + vmg.prev = vms.prev; + vma = NULL; + } else { + vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev); + } + + /* Check against address space limit. */ + if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) { + error = -ENOMEM; + goto abort_munmap; + } + + /* + * Private writable mapping: check memory availability + */ + if (accountable_mapping(file, vm_flags)) { + charged = pglen; + charged -= vms.nr_accounted; + if (charged) { + error = security_vm_enough_memory_mm(mm, charged); + if (error) + goto abort_munmap; + } + + vms.nr_accounted = 0; + vm_flags |= VM_ACCOUNT; + vmg.flags = vm_flags; + } + + /* + * clear PTEs while the vma is still in the tree so that rmap + * cannot race with the freeing later in the truncate scenario. + * This is also needed for mmap_file(), which is why vm_ops + * close function is called. + */ + vms_clean_up_area(&vms, &mas_detach); + vma = vma_merge_new_range(&vmg); + if (vma) + goto expanded; + /* + * Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = vm_area_alloc(mm); + if (!vma) { + error = -ENOMEM; + goto unacct_error; + } + + vma_iter_config(&vmi, addr, end); + vma_set_range(vma, addr, end, pgoff); + vm_flags_init(vma, vm_flags); + vma->vm_page_prot = vm_get_page_prot(vm_flags); + + if (vma_iter_prealloc(&vmi, vma)) { + error = -ENOMEM; + goto free_vma; + } + + if (file) { + vma->vm_file = get_file(file); + error = mmap_file(file, vma); + if (error) + goto unmap_and_free_file_vma; + + /* Drivers cannot alter the address of the VMA. */ + WARN_ON_ONCE(addr != vma->vm_start); + /* + * Drivers should not permit writability when previously it was + * disallowed. 
+ */ + VM_WARN_ON_ONCE(vm_flags != vma->vm_flags && + !(vm_flags & VM_MAYWRITE) && + (vma->vm_flags & VM_MAYWRITE)); + + vma_iter_config(&vmi, addr, end); + /* + * If vm_flags changed after mmap_file(), we should try merge + * vma again as we may succeed this time. + */ + if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { + struct vm_area_struct *merge; + + vmg.flags = vma->vm_flags; + /* If this fails, state is reset ready for a reattempt. */ + merge = vma_merge_new_range(&vmg); + + if (merge) { + /* + * ->mmap() can change vma->vm_file and fput + * the original file. So fput the vma->vm_file + * here or we would add an extra fput for file + * and cause general protection fault + * ultimately. + */ + fput(vma->vm_file); + vm_area_free(vma); + vma = merge; + /* Update vm_flags to pick up the change. */ + vm_flags = vma->vm_flags; + goto file_expanded; + } + vma_iter_config(&vmi, addr, end); + } + + vm_flags = vma->vm_flags; + } else if (vm_flags & VM_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_iter_vma; + } else { + vma_set_anonymous(vma); + } + +#ifdef CONFIG_SPARC64 + /* TODO: Fix SPARC ADI! */ + WARN_ON_ONCE(!arch_validate_flags(vm_flags)); +#endif + + /* Lock the VMA since it is modified after insertion into VMA tree */ + vma_start_write(vma); + vma_iter_store(&vmi, vma); + mm->map_count++; + vma_link_file(vma); + + /* + * vma_merge_new_range() calls khugepaged_enter_vma() too, the below + * call covers the non-merge case. + */ + khugepaged_enter_vma(vma, vma->vm_flags); + +file_expanded: + file = vma->vm_file; + ksm_add_vma(vma); +expanded: + perf_event_mmap(vma); + + /* Unmap any existing mapping in the area */ + vms_complete_munmap_vmas(&vms, &mas_detach); + + vm_stat_account(mm, vm_flags, pglen); + if (vm_flags & VM_LOCKED) { + if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm)) + vm_flags_clear(vma, VM_LOCKED_MASK); + else + mm->locked_vm += pglen; + } + + if (file) + uprobe_mmap(vma); + + /* + * New (or expanded) vma always get soft dirty status. + * Otherwise user-space soft-dirty page tracker won't + * be able to distinguish situation when vma area unmapped, + * then new mapped in-place (which must be aimed as + * a completely new data area). + */ + vm_flags_set(vma, VM_SOFTDIRTY); + + vma_set_page_prot(vma); + + return addr; + +unmap_and_free_file_vma: + fput(vma->vm_file); + vma->vm_file = NULL; + + vma_iter_set(&vmi, vma->vm_end); + /* Undo any partial mapping done by a device driver. */ + unmap_region(&vmi.mas, vma, vmg.prev, vmg.next); +free_iter_vma: + vma_iter_free(&vmi); +free_vma: + vm_area_free(vma); +unacct_error: + if (charged) + vm_unacct_memory(charged); + +abort_munmap: + vms_abort_munmap_vmas(&vms, &mas_detach); +gather_failed: + return error; +} diff --git a/mm/vma.h b/mm/vma.h index d58068c0ff2ea..388d347486744 100644 --- a/mm/vma.h +++ b/mm/vma.h @@ -165,99 +165,6 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi, return 0; } -#ifdef CONFIG_MMU -/* - * init_vma_munmap() - Initializer wrapper for vma_munmap_struct - * @vms: The vma munmap struct - * @vmi: The vma iterator - * @vma: The first vm_area_struct to munmap - * @start: The aligned start address to munmap - * @end: The aligned end address to munmap - * @uf: The userfaultfd list_head - * @unlock: Unlock after the operation. 
Only unlocked on success - */ -static inline void init_vma_munmap(struct vma_munmap_struct *vms, - struct vma_iterator *vmi, struct vm_area_struct *vma, - unsigned long start, unsigned long end, struct list_head *uf, - bool unlock) -{ - vms->vmi = vmi; - vms->vma = vma; - if (vma) { - vms->start = start; - vms->end = end; - } else { - vms->start = vms->end = 0; - } - vms->unlock = unlock; - vms->uf = uf; - vms->vma_count = 0; - vms->nr_pages = vms->locked_vm = vms->nr_accounted = 0; - vms->exec_vm = vms->stack_vm = vms->data_vm = 0; - vms->unmap_start = FIRST_USER_ADDRESS; - vms->unmap_end = USER_PGTABLES_CEILING; - vms->clear_ptes = false; -} -#endif - -int vms_gather_munmap_vmas(struct vma_munmap_struct *vms, - struct ma_state *mas_detach); - -void vms_complete_munmap_vmas(struct vma_munmap_struct *vms, - struct ma_state *mas_detach); - -void vms_clean_up_area(struct vma_munmap_struct *vms, - struct ma_state *mas_detach); - -/* - * reattach_vmas() - Undo any munmap work and free resources - * @mas_detach: The maple state with the detached maple tree - * - * Reattach any detached vmas and free up the maple tree used to track the vmas. - */ -static inline void reattach_vmas(struct ma_state *mas_detach) -{ - struct vm_area_struct *vma; - - mas_set(mas_detach, 0); - mas_for_each(mas_detach, vma, ULONG_MAX) - vma_mark_detached(vma, false); - - __mt_destroy(mas_detach->tree); -} - -/* - * vms_abort_munmap_vmas() - Undo as much as possible from an aborted munmap() - * operation. - * @vms: The vma unmap structure - * @mas_detach: The maple state with the detached maple tree - * - * Reattach any detached vmas, free up the maple tree used to track the vmas. - * If that's not possible because the ptes are cleared (and vm_ops->closed() may - * have been called), then a NULL is written over the vmas and the vmas are - * removed (munmap() completed). - */ -static inline void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, - struct ma_state *mas_detach) -{ - struct ma_state *mas = &vms->vmi->mas; - if (!vms->nr_pages) - return; - - if (vms->clear_ptes) - return reattach_vmas(mas_detach); - - /* - * Aborting cannot just call the vm_ops open() because they are often - * not symmetrical and state data has been lost. Resort to the old - * failure method of leaving a gap where the MAP_FIXED mapping failed. 
- */ - mas_set_range(mas, vms->start, vms->end - 1); - mas_store_gfp(mas, NULL, GFP_KERNEL|__GFP_NOFAIL); - /* Clean up the insertion of the unfortunate gap */ - vms_complete_munmap_vmas(vms, mas_detach); -} - int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, @@ -336,6 +243,10 @@ bool vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot); int mm_take_all_locks(struct mm_struct *mm); void mm_drop_all_locks(struct mm_struct *mm); +unsigned long __mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf); + static inline bool vma_wants_manual_pte_write_upgrade(struct vm_area_struct *vma) { /* diff --git a/mm/vma_internal.h b/mm/vma_internal.h index b930ab12a5878..fc5f172a36bd7 100644 --- a/mm/vma_internal.h +++ b/mm/vma_internal.h @@ -17,8 +17,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -32,11 +34,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #include #include #include From 0d11630cc50a625662a973b5ab5f448aaf59cb23 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Oct 2024 13:26:25 +0100 Subject: [PATCH 122/215] mm: refactor __mmap_region() We have seen bugs and resource leaks arise from the complexity of the __mmap_region() function. This, and the generally deeply fragile error handling logic and complexity which makes understanding the function difficult make it highly desirable to refactor it into something readable. Achieve this by separating the function into smaller logical parts which are easier to understand and follow, and which importantly very significantly simplify the error handling. Note that we now call vms_abort_munmap_vmas() in more error paths than we used to, however in cases where no abort need occur, vms->nr_pages will be equal to zero and we simply exit this function without doing more than we would have done previously. Importantly, the invocation of the driver mmap hook via mmap_file() now has very simple and obvious handling (this was previously the most problematic part of the mmap() operation). Use a generalised stack-based 'mmap state' to thread through values and also retrieve state as needed. Also avoid ever relying on vma merge (vmg) state after a merge is attempted, instead maintain meaningful state in the mmap state and establish vmg state as and when required. This avoids any subtle bugs arising from merge logic mutating this state and mmap_region() logic later relying upon it. Link: https://lkml.kernel.org/r/25bd2edc3275450f448cbfe0756ce2a7cd06810f.1729858176.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/vma.c | 410 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 270 insertions(+), 140 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index 0a2965be582dd..b91c947babd6c 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -7,6 +7,56 @@ #include "vma_internal.h" #include "vma.h" +struct mmap_state { + struct mm_struct *mm; + struct vma_iterator *vmi; + + unsigned long addr; + unsigned long end; + pgoff_t pgoff; + unsigned long pglen; + unsigned long flags; + struct file *file; + + unsigned long charged; + + struct vm_area_struct *prev; + struct vm_area_struct *next; + + /* Unmapping state. 
*/ + struct vma_munmap_struct vms; + struct ma_state mas_detach; + struct maple_tree mt_detach; +}; + +#define MMAP_STATE(name, mm_, vmi_, addr_, len_, pgoff_, flags_, file_) \ + struct mmap_state name = { \ + .mm = mm_, \ + .vmi = vmi_, \ + .addr = addr_, \ + .end = (addr_) + len, \ + .pgoff = pgoff_, \ + .pglen = PHYS_PFN(len_), \ + .flags = flags_, \ + .file = file_, \ + } + +#define VMG_MMAP_STATE(name, map_, vma_) \ + struct vma_merge_struct name = { \ + .mm = (map_)->mm, \ + .vmi = (map_)->vmi, \ + .start = (map_)->addr, \ + .end = (map_)->end, \ + .flags = (map_)->flags, \ + .pgoff = (map_)->pgoff, \ + .file = (map_)->file, \ + .prev = (map_)->prev, \ + .vma = vma_, \ + .next = (vma_) ? NULL : (map_)->next, \ + .state = VMA_MERGE_START, \ + .merge_flags = VMG_FLAG_DEFAULT, \ + } + static inline bool is_mergeable_vma(struct vma_merge_struct *vmg, bool merge_next) { struct vm_area_struct *vma = merge_next ? vmg->next : vmg->prev; @@ -2169,188 +2219,249 @@ static void vms_abort_munmap_vmas(struct vma_munmap_struct *vms, vms_complete_munmap_vmas(vms, mas_detach); } -unsigned long __mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) +/* + * __mmap_prepare() - Prepare to gather any overlapping VMAs that need to be + * unmapped once the map operation is completed, check limits, account mapping + * and clean up any pre-existing VMAs. + * + * @map: Mapping state. + * @uf: Userfaultfd context list. + * + * Returns: 0 on success, error code otherwise. + */ +static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = NULL; - pgoff_t pglen = PHYS_PFN(len); - unsigned long charged = 0; - struct vma_munmap_struct vms; - struct ma_state mas_detach; - struct maple_tree mt_detach; - unsigned long end = addr + len; int error; - VMA_ITERATOR(vmi, mm, addr); - VMG_STATE(vmg, mm, &vmi, addr, end, vm_flags, pgoff); - - vmg.file = file; - /* Find the first overlapping VMA */ - vma = vma_find(&vmi, end); - init_vma_munmap(&vms, &vmi, vma, addr, end, uf, /* unlock = */ false); - if (vma) { - mt_init_flags(&mt_detach, vmi.mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); - mt_on_stack(mt_detach); - mas_init(&mas_detach, &mt_detach, /* addr = */ 0); + struct vma_iterator *vmi = map->vmi; + struct vma_munmap_struct *vms = &map->vms; + + /* Find the first overlapping VMA and initialise unmap state. */ + vms->vma = vma_find(vmi, map->end); + init_vma_munmap(vms, vmi, vms->vma, map->addr, map->end, uf, + /* unlock = */ false); + + /* OK, we have overlapping VMAs - prepare to unmap them. */ + if (vms->vma) { + mt_init_flags(&map->mt_detach, + vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK); + mt_on_stack(map->mt_detach); + mas_init(&map->mas_detach, &map->mt_detach, /* addr = */ 0); /* Prepare to unmap any existing mapping in the area */ - error = vms_gather_munmap_vmas(&vms, &mas_detach); - if (error) - goto gather_failed; + error = vms_gather_munmap_vmas(vms, &map->mas_detach); + if (error) { + /* On error VMAs will already have been reattached. */ + vms->nr_pages = 0; + return error; + } - vmg.next = vms.next; - vmg.prev = vms.prev; - vma = NULL; + map->next = vms->next; + map->prev = vms->prev; } else { - vmg.next = vma_iter_next_rewind(&vmi, &vmg.prev); + map->next = vma_iter_next_rewind(vmi, &map->prev); } /* Check against address space limit. 
*/ - if (!may_expand_vm(mm, vm_flags, pglen - vms.nr_pages)) { - error = -ENOMEM; - goto abort_munmap; - } + if (!may_expand_vm(map->mm, map->flags, map->pglen - vms->nr_pages)) + return -ENOMEM; - /* - * Private writable mapping: check memory availability - */ - if (accountable_mapping(file, vm_flags)) { - charged = pglen; - charged -= vms.nr_accounted; - if (charged) { - error = security_vm_enough_memory_mm(mm, charged); + /* Private writable mapping: check memory availability. */ + if (accountable_mapping(map->file, map->flags)) { + map->charged = map->pglen; + map->charged -= vms->nr_accounted; + if (map->charged) { + error = security_vm_enough_memory_mm(map->mm, map->charged); if (error) - goto abort_munmap; + return error; } - vms.nr_accounted = 0; - vm_flags |= VM_ACCOUNT; - vmg.flags = vm_flags; + vms->nr_accounted = 0; + map->flags |= VM_ACCOUNT; } /* - * clear PTEs while the vma is still in the tree so that rmap + * Clear PTEs while the vma is still in the tree so that rmap * cannot race with the freeing later in the truncate scenario. * This is also needed for mmap_file(), which is why vm_ops * close function is called. */ - vms_clean_up_area(&vms, &mas_detach); - vma = vma_merge_new_range(&vmg); - if (vma) - goto expanded; + vms_clean_up_area(vms, &map->mas_detach); + + return 0; +} + +static int __mmap_new_file_vma(struct mmap_state *map, + struct vm_area_struct **vmap, bool *mergedp) +{ + struct vma_iterator *vmi = map->vmi; + struct vm_area_struct *vma = *vmap; + int error; + + vma->vm_file = get_file(map->file); + error = mmap_file(vma->vm_file, vma); + if (error) { + fput(vma->vm_file); + vma->vm_file = NULL; + + vma_iter_set(vmi, vma->vm_end); + /* Undo any partial mapping done by a device driver. */ + unmap_region(&vmi->mas, vma, map->prev, map->next); + + return error; + } + + /* Drivers cannot alter the address of the VMA. */ + WARN_ON_ONCE(map->addr != vma->vm_start); + /* + * Drivers should not permit writability when previously it was + * disallowed. + */ + VM_WARN_ON_ONCE(map->flags != vma->vm_flags && + !(map->flags & VM_MAYWRITE) && + (vma->vm_flags & VM_MAYWRITE)); + + /* mmap_file() might have changed VMA flags. */ + map->flags = vma->vm_flags; + + vma_iter_config(vmi, map->addr, map->end); + /* + * If flags changed after mmap_file(), we should try merge + * vma again as we may succeed this time. + */ + if (unlikely(map->flags != vma->vm_flags && map->prev)) { + struct vm_area_struct *merge; + VMG_MMAP_STATE(vmg, map, /* vma = */ NULL); + + merge = vma_merge_new_range(&vmg); + if (merge) { + /* + * ->mmap() can change vma->vm_file and fput + * the original file. So fput the vma->vm_file + * here or we would add an extra fput for file + * and cause general protection fault + * ultimately. + */ + fput(vma->vm_file); + vm_area_free(vma); + vma = merge; + *mergedp = true; + } else { + vma_iter_config(vmi, map->addr, map->end); + } + } + + *vmap = vma; + return 0; +} + +/* + * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not + * possible. + * + * An exception to this is if the mapping is file-backed, and the underlying + * driver changes the VMA flags, permitting a subsequent merge of the VMA, in + * which case the returned VMA is one that was merged on a second attempt. + * + * @map: Mapping state. + * @vmap: Output pointer for the new VMA. + * + * Returns: Zero on success, or an error. 
+ */ +static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) +{ + struct vma_iterator *vmi = map->vmi; + int error = 0; + bool merged = false; + struct vm_area_struct *vma; + /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but * not unmapped, but the maps are removed from the list. */ - vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; - goto unacct_error; - } + vma = vm_area_alloc(map->mm); + if (!vma) + return -ENOMEM; - vma_iter_config(&vmi, addr, end); - vma_set_range(vma, addr, end, pgoff); - vm_flags_init(vma, vm_flags); - vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma_iter_config(vmi, map->addr, map->end); + vma_set_range(vma, map->addr, map->end, map->pgoff); + vm_flags_init(vma, map->flags); + vma->vm_page_prot = vm_get_page_prot(map->flags); - if (vma_iter_prealloc(&vmi, vma)) { + if (vma_iter_prealloc(vmi, vma)) { error = -ENOMEM; goto free_vma; } - if (file) { - vma->vm_file = get_file(file); - error = mmap_file(file, vma); - if (error) - goto unmap_and_free_file_vma; - - /* Drivers cannot alter the address of the VMA. */ - WARN_ON_ONCE(addr != vma->vm_start); - /* - * Drivers should not permit writability when previously it was - * disallowed. - */ - VM_WARN_ON_ONCE(vm_flags != vma->vm_flags && - !(vm_flags & VM_MAYWRITE) && - (vma->vm_flags & VM_MAYWRITE)); - - vma_iter_config(&vmi, addr, end); - /* - * If vm_flags changed after mmap_file(), we should try merge - * vma again as we may succeed this time. - */ - if (unlikely(vm_flags != vma->vm_flags && vmg.prev)) { - struct vm_area_struct *merge; - - vmg.flags = vma->vm_flags; - /* If this fails, state is reset ready for a reattempt. */ - merge = vma_merge_new_range(&vmg); - - if (merge) { - /* - * ->mmap() can change vma->vm_file and fput - * the original file. So fput the vma->vm_file - * here or we would add an extra fput for file - * and cause general protection fault - * ultimately. - */ - fput(vma->vm_file); - vm_area_free(vma); - vma = merge; - /* Update vm_flags to pick up the change. */ - vm_flags = vma->vm_flags; - goto file_expanded; - } - vma_iter_config(&vmi, addr, end); - } - - vm_flags = vma->vm_flags; - } else if (vm_flags & VM_SHARED) { + if (map->file) + error = __mmap_new_file_vma(map, &vma, &merged); + else if (map->flags & VM_SHARED) error = shmem_zero_setup(vma); - if (error) - goto free_iter_vma; - } else { + else vma_set_anonymous(vma); - } + + if (error) + goto free_iter_vma; + + if (merged) + goto file_expanded; #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ - WARN_ON_ONCE(!arch_validate_flags(vm_flags)); + WARN_ON_ONCE(!arch_validate_flags(map->flags)); #endif /* Lock the VMA since it is modified after insertion into VMA tree */ vma_start_write(vma); - vma_iter_store(&vmi, vma); - mm->map_count++; + vma_iter_store(vmi, vma); + map->mm->map_count++; vma_link_file(vma); /* * vma_merge_new_range() calls khugepaged_enter_vma() too, the below * call covers the non-merge case. */ - khugepaged_enter_vma(vma, vma->vm_flags); + khugepaged_enter_vma(vma, map->flags); file_expanded: - file = vma->vm_file; ksm_add_vma(vma); -expanded: + *vmap = vma; + return 0; + +free_iter_vma: + vma_iter_free(vmi); +free_vma: + vm_area_free(vma); + return error; +} + +/* + * __mmap_complete() - Unmap any VMAs we overlap, account memory mapping + * statistics, handle locking and finalise the VMA. + * + * @map: Mapping state. + * @vma: Merged or newly allocated VMA for the mmap()'d region. 
+ */ +static void __mmap_complete(struct mmap_state *map, struct vm_area_struct *vma) +{ + struct mm_struct *mm = map->mm; + unsigned long vm_flags = vma->vm_flags; + perf_event_mmap(vma); - /* Unmap any existing mapping in the area */ - vms_complete_munmap_vmas(&vms, &mas_detach); + /* Unmap any existing mapping in the area. */ + vms_complete_munmap_vmas(&map->vms, &map->mas_detach); - vm_stat_account(mm, vm_flags, pglen); + vm_stat_account(mm, vma->vm_flags, map->pglen); if (vm_flags & VM_LOCKED) { if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm)) + vma == get_gate_vma(mm)) vm_flags_clear(vma, VM_LOCKED_MASK); else - mm->locked_vm += pglen; + mm->locked_vm += map->pglen; } - if (file) + if (vma->vm_file) uprobe_mmap(vma); /* @@ -2363,26 +2474,45 @@ unsigned long __mmap_region(struct file *file, unsigned long addr, vm_flags_set(vma, VM_SOFTDIRTY); vma_set_page_prot(vma); +} - return addr; +unsigned long __mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + int error; + VMA_ITERATOR(vmi, mm, addr); + MMAP_STATE(map, mm, &vmi, addr, len, pgoff, vm_flags, file); -unmap_and_free_file_vma: - fput(vma->vm_file); - vma->vm_file = NULL; + error = __mmap_prepare(&map, uf); + if (error) + goto abort_munmap; - vma_iter_set(&vmi, vma->vm_end); - /* Undo any partial mapping done by a device driver. */ - unmap_region(&vmi.mas, vma, vmg.prev, vmg.next); -free_iter_vma: - vma_iter_free(&vmi); -free_vma: - vm_area_free(vma); -unacct_error: - if (charged) - vm_unacct_memory(charged); + /* Attempt to merge with adjacent VMAs... */ + if (map.prev || map.next) { + VMG_MMAP_STATE(vmg, &map, /* vma = */ NULL); + + vma = vma_merge_new_range(&vmg); + } + /* ...but if we can't, allocate a new VMA. */ + if (!vma) { + error = __mmap_new_vma(&map, &vma); + if (error) + goto unacct_error; + } + + __mmap_complete(&map, vma); + + return addr; + + /* Accounting was done by __mmap_prepare(). */ +unacct_error: + if (map.charged) + vm_unacct_memory(map.charged); abort_munmap: - vms_abort_munmap_vmas(&vms, &mas_detach); -gather_failed: + vms_abort_munmap_vmas(&map.vms, &map.mas_detach); return error; } From 5a689bac0bbc1ddad1e9f87b574f3d409643759c Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Oct 2024 13:26:26 +0100 Subject: [PATCH 123/215] mm: remove unnecessary reset state logic on merge new VMA The only place where this was used was in mmap_region(), which we have now adjusted to not require this to be performed (we reset ourselves in effect). It also created a dangerous assumption that VMG state could be safely reused after a merge, at which point it may have been mutated in unexpected ways, leading to subtle bugs. Note that it was discovered by Wei Yang that there was also an error in this code - we are comparing vmg->vma with prev after setting it to NULL. This however had no impact, as we previously reset VMA iterator state before attempting merge again, but it was useless effort. In any case, this patch removes all of the logic so also eliminates this wasted effort. Link: https://lkml.kernel.org/r/5d9a59eee6498ae017cc87d89aa723de7179f75d.1729858176.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam R. 
Howlett Cc: Linus Torvalds Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/vma.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index b91c947babd6c..7c690be679103 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -963,7 +963,6 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) struct vm_area_struct *next = vmg->next; unsigned long start = vmg->start; unsigned long end = vmg->end; - pgoff_t pgoff = vmg->pgoff; pgoff_t pglen = PHYS_PFN(end - start); bool can_merge_left, can_merge_right; bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND; @@ -1020,16 +1019,6 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) return vmg->vma; } - /* If expansion failed, reset state. Allows us to retry merge later. */ - if (!just_expand) { - vmg->vma = NULL; - vmg->start = start; - vmg->end = end; - vmg->pgoff = pgoff; - if (vmg->vma == prev) - vma_iter_set(vmg->vmi, start); - } - return NULL; } From 5ac87a885aecb3fa2aae04215410882757a2ef06 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 25 Oct 2024 13:26:27 +0100 Subject: [PATCH 124/215] mm: defer second attempt at merge on mmap() Rather than trying to merge again when ostensibly allocating a new VMA, instead defer until the VMA is added and attempt to merge the existing range. This way we have no complicated unwinding logic midway through the process of mapping the VMA. In addition this removes limitations on the VMA not being able to be the first in the virtual memory address space which was previously implicitly required. In theory, for this very same reason, we should unconditionally attempt merge here, however this is likely to have a performance impact so it is better to avoid this given the unlikely outcome of a merge. [lorenzo.stoakes@oracle.com: remove unnecessary indirection] Link: https://lkml.kernel.org/r/5106696d-e625-4d8a-8545-9d1430301730@lucifer.local Link: https://lkml.kernel.org/r/d4f84502605d7651ac114587f507395c0fc76004.1729858176.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Vlastimil Babka Cc: Jann Horn Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/vma.c | 56 ++++++++++++++------------------------------------------ 1 file changed, 14 insertions(+), 42 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index 7c690be679103..c26bbc898f853 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -19,6 +19,7 @@ struct mmap_state { struct file *file; unsigned long charged; + bool retry_merge; struct vm_area_struct *prev; struct vm_area_struct *next; @@ -2278,11 +2279,11 @@ static int __mmap_prepare(struct mmap_state *map, struct list_head *uf) return 0; } + static int __mmap_new_file_vma(struct mmap_state *map, - struct vm_area_struct **vmap, bool *mergedp) + struct vm_area_struct *vma) { struct vma_iterator *vmi = map->vmi; - struct vm_area_struct *vma = *vmap; int error; vma->vm_file = get_file(map->file); @@ -2308,37 +2309,10 @@ static int __mmap_new_file_vma(struct mmap_state *map, !(map->flags & VM_MAYWRITE) && (vma->vm_flags & VM_MAYWRITE)); - /* mmap_file() might have changed VMA flags. */ + /* If the flags change (and are mergeable), let's retry later. */ + map->retry_merge = vma->vm_flags != map->flags && !(vma->vm_flags & VM_SPECIAL); map->flags = vma->vm_flags; - vma_iter_config(vmi, map->addr, map->end); - /* - * If flags changed after mmap_file(), we should try merge - * vma again as we may succeed this time. 
- */ - if (unlikely(map->flags != vma->vm_flags && map->prev)) { - struct vm_area_struct *merge; - VMG_MMAP_STATE(vmg, map, /* vma = */ NULL); - - merge = vma_merge_new_range(&vmg); - if (merge) { - /* - * ->mmap() can change vma->vm_file and fput - * the original file. So fput the vma->vm_file - * here or we would add an extra fput for file - * and cause general protection fault - * ultimately. - */ - fput(vma->vm_file); - vm_area_free(vma); - vma = merge; - *mergedp = true; - } else { - vma_iter_config(vmi, map->addr, map->end); - } - } - - *vmap = vma; return 0; } @@ -2346,10 +2320,6 @@ static int __mmap_new_file_vma(struct mmap_state *map, * __mmap_new_vma() - Allocate a new VMA for the region, as merging was not * possible. * - * An exception to this is if the mapping is file-backed, and the underlying - * driver changes the VMA flags, permitting a subsequent merge of the VMA, in - * which case the returned VMA is one that was merged on a second attempt. - * * @map: Mapping state. * @vmap: Output pointer for the new VMA. * @@ -2359,7 +2329,6 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) { struct vma_iterator *vmi = map->vmi; int error = 0; - bool merged = false; struct vm_area_struct *vma; /* @@ -2382,7 +2351,7 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) } if (map->file) - error = __mmap_new_file_vma(map, &vma, &merged); + error = __mmap_new_file_vma(map, vma); else if (map->flags & VM_SHARED) error = shmem_zero_setup(vma); else @@ -2391,9 +2360,6 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) if (error) goto free_iter_vma; - if (merged) - goto file_expanded; - #ifdef CONFIG_SPARC64 /* TODO: Fix SPARC ADI! */ WARN_ON_ONCE(!arch_validate_flags(map->flags)); @@ -2410,8 +2376,6 @@ static int __mmap_new_vma(struct mmap_state *map, struct vm_area_struct **vmap) * call covers the non-merge case. */ khugepaged_enter_vma(vma, map->flags); - -file_expanded: ksm_add_vma(vma); *vmap = vma; return 0; @@ -2493,6 +2457,14 @@ unsigned long __mmap_region(struct file *file, unsigned long addr, goto unacct_error; } + /* If flags changed, we might be able to merge, so try again. */ + if (map.retry_merge) { + VMG_MMAP_STATE(vmg, &map, vma); + + vma_iter_config(map.vmi, map.addr, map.end); + vma_merge_existing_range(&vmg); + } + __mmap_complete(&map, vma); return addr; From 642c66d84cd4c0506698ae52d0c6fd12d3695c01 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 24 Oct 2024 09:33:47 +0000 Subject: [PATCH 125/215] mm/vma: the pgoff is correct if can_merge_right By this point can_vma_merge_right() must have returned true, which implies can_vma_merge_before() also returned true, which already asserts that the pgoff is as expected for a merge with the following VMA, thus this assignment is redundant. Below is a more detail explanation. Current definition of can_vma_merge_right() is: static bool can_vma_merge_right(struct vma_merge_struct *vmg, bool can_merge_left) { if (!vmg->next || vmg->end != vmg->next->vm_start || !can_vma_merge_before(vmg)) return false; ... } And: static bool can_vma_merge_before(struct vma_merge_struct *vmg) { pgoff_t pglen = PHYS_PFN(vmg->end - vmg->start); ... if (vmg->next->vm_pgoff == vmg->pgoff + pglen) return true; ... } Which implies vmg->pgoff == vmg->next->vm_pgoff - pglen. None of these values are changed between the check and prior assignment, so this was an entirely redundant assignment. 
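To make the redundancy concrete, here is a short worked example with made-up values (assuming 4 KiB pages; the numbers are purely illustrative and not taken from the patch):

/*
 * Illustrative numbers only:
 *
 *   vmg->start     = 0x2000
 *   vmg->end       = 0x4000   =>  pglen = PHYS_PFN(0x4000 - 0x2000) = 2
 *   next->vm_start = 0x4000
 *   next->vm_pgoff = 7
 *
 * can_vma_merge_right() only returns true after can_vma_merge_before()
 * has verified next->vm_pgoff == vmg->pgoff + pglen, i.e. 7 == vmg->pgoff + 2,
 * so vmg->pgoff must already be 5.  The removed assignment
 *
 *   vmg->pgoff = next->vm_pgoff - pglen;   (7 - 2 == 5)
 *
 * therefore stores the value vmg->pgoff already holds.
 */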
[akpm@linux-foundation.org: remove now-unused local] [lorenzo.stoakes@oracle.com: rephrase the changelog] Link: https://lkml.kernel.org/r/20241024093347.18057-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Lorenzo Stoakes Cc: Jann Horn Cc: Vlastimil Babka Cc: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/vma.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/vma.c b/mm/vma.c index c26bbc898f853..68138e8c153e6 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -962,9 +962,7 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) { struct vm_area_struct *prev = vmg->prev; struct vm_area_struct *next = vmg->next; - unsigned long start = vmg->start; unsigned long end = vmg->end; - pgoff_t pglen = PHYS_PFN(end - start); bool can_merge_left, can_merge_right; bool just_expand = vmg->merge_flags & VMG_FLAG_JUST_EXPAND; @@ -986,7 +984,6 @@ struct vm_area_struct *vma_merge_new_range(struct vma_merge_struct *vmg) if (can_merge_right) { vmg->end = next->vm_end; vmg->vma = next; - vmg->pgoff = next->vm_pgoff - pglen; } /* If we can merge with the previous VMA, adjust vmg accordingly. */ From 906c38ff52e95575ddf3281bee531eded3dba150 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Sat, 26 Oct 2024 09:37:07 -0700 Subject: [PATCH 126/215] memcg: workingset: remove folio_memcg_rcu usage The function workingset_activation() is called from folio_mark_accessed() with the guarantee that the given folio can not be freed under us in workingset_activation(). In addition, the association of the folio and its memcg can not be broken here because charge migration is no more. There is no need to use folio_memcg_rcu. Simply use folio_memcg_charged() because that is what this function cares about. [akpm@linux-foundation.org: provide folio_memcg_charged stub for CONFIG_MEMCG=n] Link: https://lkml.kernel.org/r/20241026163707.2479526-1-shakeel.butt@linux.dev Signed-off-by: Shakeel Butt Suggested-by: Yu Zhao Cc: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Muchun Song Cc: Hugh Dickins Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 34 ++-------------------------------- mm/workingset.c | 14 ++------------ 2 files changed, 4 insertions(+), 44 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index bb49e0d4b377a..8e4608be811d0 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -443,35 +443,6 @@ static inline bool folio_memcg_charged(struct folio *folio) return __folio_memcg(folio) != NULL; } -/** - * folio_memcg_rcu - Locklessly get the memory cgroup associated with a folio. - * @folio: Pointer to the folio. - * - * This function assumes that the folio is known to have a - * proper memory cgroup pointer. It's not safe to call this function - * against some type of folios, e.g. slab folios or ex-slab folios. - * - * Return: A pointer to the memory cgroup associated with the folio, - * or NULL. - */ -static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) -{ - unsigned long memcg_data = READ_ONCE(folio->memcg_data); - - VM_BUG_ON_FOLIO(folio_test_slab(folio), folio); - - if (memcg_data & MEMCG_DATA_KMEM) { - struct obj_cgroup *objcg; - - objcg = (void *)(memcg_data & ~OBJEXTS_FLAGS_MASK); - return obj_cgroup_memcg(objcg); - } - - WARN_ON_ONCE(!rcu_read_lock_held()); - - return (struct mem_cgroup *)(memcg_data & ~OBJEXTS_FLAGS_MASK); -} - /* * folio_memcg_check - Get the memory cgroup associated with a folio. * @folio: Pointer to the folio. 
@@ -1084,10 +1055,9 @@ static inline struct mem_cgroup *folio_memcg(struct folio *folio) return NULL; } -static inline struct mem_cgroup *folio_memcg_rcu(struct folio *folio) +static inline bool folio_memcg_charged(struct folio *folio) { - WARN_ON_ONCE(!rcu_read_lock_held()); - return NULL; + return false; } static inline struct mem_cgroup *folio_memcg_check(struct folio *folio) diff --git a/mm/workingset.c b/mm/workingset.c index a2b28e356e68e..0e38bec261a41 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -591,22 +591,12 @@ void workingset_refault(struct folio *folio, void *shadow) */ void workingset_activation(struct folio *folio) { - struct mem_cgroup *memcg; - - rcu_read_lock(); /* * Filter non-memcg pages here, e.g. unmap can call * mark_page_accessed() on VDSO pages. - * - * XXX: See workingset_refault() - this should return - * root_mem_cgroup even for !CONFIG_MEMCG. */ - memcg = folio_memcg_rcu(folio); - if (!mem_cgroup_disabled() && !memcg) - goto out; - workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); -out: - rcu_read_unlock(); + if (mem_cgroup_disabled() || folio_memcg_charged(folio)) + workingset_age_nonresident(folio_lruvec(folio), folio_nr_pages(folio)); } /* From beeb9220c7307fbb61a2cd6575907db52bde722f Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:04 +0300 Subject: [PATCH 127/215] mm: vmalloc: group declarations depending on CONFIG_MMU together Patch series "x86/module: use large ROX pages for text allocations", v7. These patches add support for using large ROX pages for allocations of executable memory on x86. They address Andy's comments [1] about having executable mappings for code that was not completely formed. The approach taken is to allocate ROX memory along with writable but not executable memory and use the writable copy to perform relocations and alternatives patching. After the module text gets into its final shape, the contents of the writable memory is copied into the actual ROX location using text poking. The allocations of the ROX memory use vmalloc(VMAP_ALLOW_HUGE_MAP) to allocate PMD aligned memory, fill that memory with invalid instructions and in the end remap it as ROX. Portions of these large pages are handed out to execmem_alloc() callers without any changes to the permissions. When the memory is freed with execmem_free() it is invalidated again so that it won't contain stale instructions. The module memory allocation, x86 code dealing with relocations and alternatives patching take into account the existence of the two copies, the writable memory and the ROX memory at the actual allocated virtual address. [1] https://lore.kernel.org/all/a17c65c6-863f-4026-9c6f-a04b659e9ab4@app.fastmail.com This patch (of 8): There are a couple of declarations that depend on CONFIG_MMU in include/linux/vmalloc.h spread all over the file. Group them all together to improve code readability. No functional changes. 
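As a generic before-and-after sketch of this kind of #ifdef consolidation (the symbols below are placeholders invented for illustration, not the actual vmalloc.h declarations):

/* Before: each MMU-dependent declaration carries its own #ifdef. */
#ifdef CONFIG_MMU
unsigned long foo_nr_pages(void);
#else
static inline unsigned long foo_nr_pages(void) { return 0; }
#endif
/* ... unrelated declarations ... */
#ifdef CONFIG_MMU
void foo_flush(void *addr);
#else
static inline void foo_flush(void *addr) {}
#endif

/* After: a single block covers every MMU-dependent declaration. */
#ifdef CONFIG_MMU
unsigned long foo_nr_pages(void);
void foo_flush(void *addr);
#else /* !CONFIG_MMU */
static inline unsigned long foo_nr_pages(void) { return 0; }
static inline void foo_flush(void *addr) {}
#endif /* CONFIG_MMU */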
Link: https://lkml.kernel.org/r/20241023162711.2579610-1-rppt@kernel.org Link: https://lkml.kernel.org/r/20241023162711.2579610-2-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/vmalloc.h | 60 +++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 36 deletions(-) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index ad2ce7a6ab7af..27408f21e501e 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -134,12 +134,6 @@ extern void vm_unmap_ram(const void *mem, unsigned int count); extern void *vm_map_ram(struct page **pages, unsigned int count, int node); extern void vm_unmap_aliases(void); -#ifdef CONFIG_MMU -extern unsigned long vmalloc_nr_pages(void); -#else -static inline unsigned long vmalloc_nr_pages(void) { return 0; } -#endif - extern void *vmalloc_noprof(unsigned long size) __alloc_size(1); #define vmalloc(...) alloc_hooks(vmalloc_noprof(__VA_ARGS__)) @@ -266,12 +260,29 @@ static inline bool is_vm_area_hugepages(const void *addr) #endif } +/* for /proc/kcore */ +long vread_iter(struct iov_iter *iter, const char *addr, size_t count); + +/* + * Internals. Don't use.. + */ +__init void vm_area_add_early(struct vm_struct *vm); +__init void vm_area_register_early(struct vm_struct *vm, size_t align); + +int register_vmap_purge_notifier(struct notifier_block *nb); +int unregister_vmap_purge_notifier(struct notifier_block *nb); + #ifdef CONFIG_MMU +#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) + +unsigned long vmalloc_nr_pages(void); + int vm_area_map_pages(struct vm_struct *area, unsigned long start, unsigned long end, struct page **pages); void vm_area_unmap_pages(struct vm_struct *area, unsigned long start, unsigned long end); void vunmap_range(unsigned long addr, unsigned long end); + static inline void set_vm_flush_reset_perms(void *addr) { struct vm_struct *vm = find_vm_area(addr); @@ -279,24 +290,14 @@ static inline void set_vm_flush_reset_perms(void *addr) if (vm) vm->flags |= VM_FLUSH_RESET_PERMS; } +#else /* !CONFIG_MMU */ +#define VMALLOC_TOTAL 0UL -#else -static inline void set_vm_flush_reset_perms(void *addr) -{ -} -#endif - -/* for /proc/kcore */ -extern long vread_iter(struct iov_iter *iter, const char *addr, size_t count); - -/* - * Internals. Don't use.. 
- */ -extern __init void vm_area_add_early(struct vm_struct *vm); -extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); +static inline unsigned long vmalloc_nr_pages(void) { return 0; } +static inline void set_vm_flush_reset_perms(void *addr) {} +#endif /* CONFIG_MMU */ -#ifdef CONFIG_SMP -# ifdef CONFIG_MMU +#if defined(CONFIG_MMU) && defined(CONFIG_SMP) struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, size_t align); @@ -311,22 +312,9 @@ pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } -static inline void -pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) -{ -} -# endif -#endif - -#ifdef CONFIG_MMU -#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START) -#else -#define VMALLOC_TOTAL 0UL +static inline void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) {} #endif -int register_vmap_purge_notifier(struct notifier_block *nb); -int unregister_vmap_purge_notifier(struct notifier_block *nb); - #if defined(CONFIG_MMU) && defined(CONFIG_PRINTK) bool vmalloc_dump_obj(void *object); #else From c82be0be957631b7eaa4b84ba458e1826484e60d Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:05 +0300 Subject: [PATCH 128/215] mm: vmalloc: don't account for number of nodes for HUGE_VMAP allocations vmalloc allocations with VM_ALLOW_HUGE_VMAP that do not explicitly specify node ID will use huge pages only if size_per_node is larger than a huge page. Still the actual allocated memory is not distributed between nodes and there is no advantage in such approach. On the contrary, BPF allocates SZ_2M * num_possible_nodes() for each new bpf_prog_pack, while it could do with a single huge page per pack. Don't account for number of nodes for VM_ALLOW_HUGE_VMAP with NUMA_NO_NODE and use huge pages whenever the requested allocation size is larger than a huge page. Link: https://lkml.kernel.org/r/20241023162711.2579610-3-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Christoph Hellwig Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/vmalloc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5480b77f4167d..5c0ea4e2b17d7 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3779,8 +3779,6 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, } if (vmap_allow_huge && (vm_flags & VM_ALLOW_HUGE_VMAP)) { - unsigned long size_per_node; - /* * Try huge pages. Only try for PAGE_KERNEL allocations, * others like modules don't yet expect huge pages in @@ -3788,13 +3786,10 @@ void *__vmalloc_node_range_noprof(unsigned long size, unsigned long align, * supporting them. 
*/ - size_per_node = size; - if (node == NUMA_NO_NODE) - size_per_node /= num_online_nodes(); - if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE) + if (arch_vmap_pmd_supported(prot) && size >= PMD_SIZE) shift = PMD_SHIFT; else - shift = arch_vmap_pte_supported_shift(size_per_node); + shift = arch_vmap_pte_supported_shift(size); align = max(real_align, 1UL << shift); size = ALIGN(real_size, 1UL << shift); From 0c3beacf681ec897e0b36685a9b49d01f5cb2dfb Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:06 +0300 Subject: [PATCH 129/215] asm-generic: introduce text-patching.h Several architectures support text patching, but they name the header files that declare patching functions differently. Make all such headers consistently named text-patching.h and add an empty header in asm-generic for architectures that do not support text patching. Link: https://lkml.kernel.org/r/20241023162711.2579610-4-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Christoph Hellwig Acked-by: Geert Uytterhoeven # m68k Acked-by: Arnd Bergmann Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Dinh Nguyen Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/alpha/include/asm/Kbuild | 1 + arch/arc/include/asm/Kbuild | 1 + arch/arm/include/asm/{patch.h => text-patching.h} | 0 arch/arm/kernel/ftrace.c | 2 +- arch/arm/kernel/jump_label.c | 2 +- arch/arm/kernel/kgdb.c | 2 +- arch/arm/kernel/patch.c | 2 +- arch/arm/probes/kprobes/core.c | 2 +- arch/arm/probes/kprobes/opt-arm.c | 2 +- .../include/asm/{patching.h => text-patching.h} | 0 arch/arm64/kernel/ftrace.c | 2 +- arch/arm64/kernel/jump_label.c | 2 +- arch/arm64/kernel/kgdb.c | 2 +- arch/arm64/kernel/patching.c | 2 +- arch/arm64/kernel/probes/kprobes.c | 2 +- arch/arm64/kernel/traps.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 2 +- arch/csky/include/asm/Kbuild | 1 + arch/hexagon/include/asm/Kbuild | 1 + arch/loongarch/include/asm/Kbuild | 1 + arch/m68k/include/asm/Kbuild | 1 + arch/microblaze/include/asm/Kbuild | 1 + arch/mips/include/asm/Kbuild | 1 + arch/nios2/include/asm/Kbuild | 1 + arch/openrisc/include/asm/Kbuild | 1 + .../include/asm/{patch.h => text-patching.h} | 0 arch/parisc/kernel/ftrace.c | 2 +- arch/parisc/kernel/jump_label.c | 2 +- arch/parisc/kernel/kgdb.c | 2 +- arch/parisc/kernel/kprobes.c | 2 +- arch/parisc/kernel/patch.c | 2 +- arch/powerpc/include/asm/kprobes.h | 2 +- .../asm/{code-patching.h => text-patching.h} | 0 arch/powerpc/kernel/crash_dump.c | 2 +- arch/powerpc/kernel/epapr_paravirt.c | 2 +- arch/powerpc/kernel/jump_label.c | 2 +- arch/powerpc/kernel/kgdb.c | 2 +- arch/powerpc/kernel/kprobes.c | 2 +- arch/powerpc/kernel/module_32.c | 2 +- arch/powerpc/kernel/module_64.c | 2 +- arch/powerpc/kernel/optprobes.c | 2 +- arch/powerpc/kernel/process.c | 2 +- arch/powerpc/kernel/security.c | 2 +- 
arch/powerpc/kernel/setup_32.c | 2 +- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/kernel/static_call.c | 2 +- arch/powerpc/kernel/trace/ftrace.c | 2 +- arch/powerpc/kernel/trace/ftrace_64_pg.c | 2 +- arch/powerpc/lib/code-patching.c | 2 +- arch/powerpc/lib/feature-fixups.c | 2 +- arch/powerpc/lib/test-code-patching.c | 2 +- arch/powerpc/lib/test_emulate_step.c | 2 +- arch/powerpc/mm/book3s32/mmu.c | 2 +- arch/powerpc/mm/book3s64/hash_utils.c | 2 +- arch/powerpc/mm/book3s64/slb.c | 2 +- arch/powerpc/mm/kasan/init_32.c | 2 +- arch/powerpc/mm/mem.c | 2 +- arch/powerpc/mm/nohash/44x.c | 2 +- arch/powerpc/mm/nohash/book3e_pgtable.c | 2 +- arch/powerpc/mm/nohash/tlb.c | 2 +- arch/powerpc/mm/nohash/tlb_64e.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/powerpc/perf/8xx-pmu.c | 2 +- arch/powerpc/perf/core-book3s.c | 2 +- arch/powerpc/platforms/85xx/smp.c | 2 +- arch/powerpc/platforms/86xx/mpc86xx_smp.c | 2 +- arch/powerpc/platforms/cell/smp.c | 2 +- arch/powerpc/platforms/powermac/smp.c | 2 +- arch/powerpc/platforms/powernv/idle.c | 2 +- arch/powerpc/platforms/powernv/smp.c | 2 +- arch/powerpc/platforms/pseries/smp.c | 2 +- arch/powerpc/xmon/xmon.c | 2 +- arch/riscv/errata/andes/errata.c | 2 +- arch/riscv/errata/sifive/errata.c | 2 +- arch/riscv/errata/thead/errata.c | 2 +- .../include/asm/{patch.h => text-patching.h} | 0 arch/riscv/include/asm/uprobes.h | 2 +- arch/riscv/kernel/alternative.c | 2 +- arch/riscv/kernel/cpufeature.c | 3 ++- arch/riscv/kernel/ftrace.c | 2 +- arch/riscv/kernel/jump_label.c | 2 +- arch/riscv/kernel/patch.c | 2 +- arch/riscv/kernel/probes/kprobes.c | 2 +- arch/riscv/net/bpf_jit_comp64.c | 2 +- arch/riscv/net/bpf_jit_core.c | 2 +- arch/sh/include/asm/Kbuild | 1 + arch/sparc/include/asm/Kbuild | 1 + arch/um/kernel/um_arch.c | 5 +++++ arch/x86/include/asm/text-patching.h | 1 + arch/xtensa/include/asm/Kbuild | 1 + include/asm-generic/text-patching.h | 5 +++++ include/linux/text-patching.h | 15 +++++++++++++++ 92 files changed, 110 insertions(+), 70 deletions(-) rename arch/arm/include/asm/{patch.h => text-patching.h} (100%) rename arch/arm64/include/asm/{patching.h => text-patching.h} (100%) rename arch/parisc/include/asm/{patch.h => text-patching.h} (100%) rename arch/powerpc/include/asm/{code-patching.h => text-patching.h} (100%) rename arch/riscv/include/asm/{patch.h => text-patching.h} (100%) create mode 100644 include/asm-generic/text-patching.h create mode 100644 include/linux/text-patching.h diff --git a/arch/alpha/include/asm/Kbuild b/arch/alpha/include/asm/Kbuild index 396caece6d6d9..483965c5a4de2 100644 --- a/arch/alpha/include/asm/Kbuild +++ b/arch/alpha/include/asm/Kbuild @@ -5,3 +5,4 @@ generic-y += agp.h generic-y += asm-offsets.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += text-patching.h diff --git a/arch/arc/include/asm/Kbuild b/arch/arc/include/asm/Kbuild index 49285a3ce2398..4c69522e0328e 100644 --- a/arch/arc/include/asm/Kbuild +++ b/arch/arc/include/asm/Kbuild @@ -6,3 +6,4 @@ generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/arm/include/asm/patch.h b/arch/arm/include/asm/text-patching.h similarity index 100% rename from arch/arm/include/asm/patch.h rename to arch/arm/include/asm/text-patching.h diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index e61591f33a6cd..845acf9ce21e3 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include /* * 
The compiler emitted profiling hook consists of diff --git a/arch/arm/kernel/jump_label.c b/arch/arm/kernel/jump_label.c index eb9c24b6e8e23..a06a92d0f5508 100644 --- a/arch/arm/kernel/jump_label.c +++ b/arch/arm/kernel/jump_label.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include -#include +#include #include static void __arch_jump_label_transform(struct jump_entry *entry, diff --git a/arch/arm/kernel/kgdb.c b/arch/arm/kernel/kgdb.c index 22f937e6f3ffb..ab76c55fd610c 100644 --- a/arch/arm/kernel/kgdb.c +++ b/arch/arm/kernel/kgdb.c @@ -15,7 +15,7 @@ #include #include -#include +#include #include struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = diff --git a/arch/arm/kernel/patch.c b/arch/arm/kernel/patch.c index e9e828b6bb306..4d45e60cd46d1 100644 --- a/arch/arm/kernel/patch.c +++ b/arch/arm/kernel/patch.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include struct patch { void *addr; diff --git a/arch/arm/probes/kprobes/core.c b/arch/arm/probes/kprobes/core.c index d8238da095df7..9fd877c87a38f 100644 --- a/arch/arm/probes/kprobes/core.c +++ b/arch/arm/probes/kprobes/core.c @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include "../decode-arm.h" diff --git a/arch/arm/probes/kprobes/opt-arm.c b/arch/arm/probes/kprobes/opt-arm.c index 7f65048380ca5..966c6042c5ad7 100644 --- a/arch/arm/probes/kprobes/opt-arm.c +++ b/arch/arm/probes/kprobes/opt-arm.c @@ -14,7 +14,7 @@ /* for arm_gen_branch */ #include /* for patch_text */ -#include +#include #include "core.h" diff --git a/arch/arm64/include/asm/patching.h b/arch/arm64/include/asm/text-patching.h similarity index 100% rename from arch/arm64/include/asm/patching.h rename to arch/arm64/include/asm/text-patching.h diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index a650f5e11fc5d..3575d03d60aff 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #ifdef CONFIG_DYNAMIC_FTRACE_WITH_ARGS struct fregs_offset { diff --git a/arch/arm64/kernel/jump_label.c b/arch/arm64/kernel/jump_label.c index f63ea915d6ad2..b345425193d28 100644 --- a/arch/arm64/kernel/jump_label.c +++ b/arch/arm64/kernel/jump_label.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include bool arch_jump_label_transform_queue(struct jump_entry *entry, enum jump_label_type type) diff --git a/arch/arm64/kernel/kgdb.c b/arch/arm64/kernel/kgdb.c index 4e1f983df3d1c..f3c4d3a8a20f9 100644 --- a/arch/arm64/kernel/kgdb.c +++ b/arch/arm64/kernel/kgdb.c @@ -17,7 +17,7 @@ #include #include -#include +#include #include struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { diff --git a/arch/arm64/kernel/patching.c b/arch/arm64/kernel/patching.c index 945df74005c70..7f99723fbb8c4 100644 --- a/arch/arm64/kernel/patching.c +++ b/arch/arm64/kernel/patching.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include static DEFINE_RAW_SPINLOCK(patch_lock); diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index 4268678d0e86c..01dbe9a56956b 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 563cbce111269..7d81998040869 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -41,7 +41,7 @@ #include #include #include -#include +#include #include #include #include diff 
--git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 5db82bfc9dc11..1bcae29ff181a 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include "bpf_jit.h" diff --git a/arch/csky/include/asm/Kbuild b/arch/csky/include/asm/Kbuild index 9a9bc65b57a9d..3a5c7f6e5aacb 100644 --- a/arch/csky/include/asm/Kbuild +++ b/arch/csky/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += qspinlock.h generic-y += parport.h generic-y += user.h generic-y += vmlinux.lds.h +generic-y += text-patching.h diff --git a/arch/hexagon/include/asm/Kbuild b/arch/hexagon/include/asm/Kbuild index 8c1a78c8f5271..1efa1e993d4b9 100644 --- a/arch/hexagon/include/asm/Kbuild +++ b/arch/hexagon/include/asm/Kbuild @@ -5,3 +5,4 @@ generic-y += extable.h generic-y += iomap.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += text-patching.h diff --git a/arch/loongarch/include/asm/Kbuild b/arch/loongarch/include/asm/Kbuild index 5b5a6c90e6e20..80ddb5edb8455 100644 --- a/arch/loongarch/include/asm/Kbuild +++ b/arch/loongarch/include/asm/Kbuild @@ -11,3 +11,4 @@ generic-y += ioctl.h generic-y += mmzone.h generic-y += statfs.h generic-y += param.h +generic-y += text-patching.h diff --git a/arch/m68k/include/asm/Kbuild b/arch/m68k/include/asm/Kbuild index 0dbf9c5c6faeb..b282e0dd8dc10 100644 --- a/arch/m68k/include/asm/Kbuild +++ b/arch/m68k/include/asm/Kbuild @@ -4,3 +4,4 @@ generic-y += extable.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += spinlock.h +generic-y += text-patching.h diff --git a/arch/microblaze/include/asm/Kbuild b/arch/microblaze/include/asm/Kbuild index a055f5dbe00a3..7178f990e8b3d 100644 --- a/arch/microblaze/include/asm/Kbuild +++ b/arch/microblaze/include/asm/Kbuild @@ -8,3 +8,4 @@ generic-y += parport.h generic-y += syscalls.h generic-y += tlb.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/mips/include/asm/Kbuild b/arch/mips/include/asm/Kbuild index 7ba67a0d6c97b..684569b2ecd6b 100644 --- a/arch/mips/include/asm/Kbuild +++ b/arch/mips/include/asm/Kbuild @@ -13,3 +13,4 @@ generic-y += parport.h generic-y += qrwlock.h generic-y += qspinlock.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/nios2/include/asm/Kbuild b/arch/nios2/include/asm/Kbuild index 0d09829ed1445..28004301c236f 100644 --- a/arch/nios2/include/asm/Kbuild +++ b/arch/nios2/include/asm/Kbuild @@ -7,3 +7,4 @@ generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += spinlock.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/openrisc/include/asm/Kbuild b/arch/openrisc/include/asm/Kbuild index cef49d60d74c0..2b1a6b00cdac0 100644 --- a/arch/openrisc/include/asm/Kbuild +++ b/arch/openrisc/include/asm/Kbuild @@ -9,3 +9,4 @@ generic-y += spinlock.h generic-y += qrwlock_types.h generic-y += qrwlock.h generic-y += user.h +generic-y += text-patching.h diff --git a/arch/parisc/include/asm/patch.h b/arch/parisc/include/asm/text-patching.h similarity index 100% rename from arch/parisc/include/asm/patch.h rename to arch/parisc/include/asm/text-patching.h diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index c91f9c2e61ed2..3e34b4473d3a7 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -20,7 +20,7 @@ #include #include #include -#include +#include #define __hot __section(".text.hot") diff --git a/arch/parisc/kernel/jump_label.c b/arch/parisc/kernel/jump_label.c index e253b134500d1..ea51f15bf0e64 
100644 --- a/arch/parisc/kernel/jump_label.c +++ b/arch/parisc/kernel/jump_label.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include static inline int reassemble_17(int as17) { diff --git a/arch/parisc/kernel/kgdb.c b/arch/parisc/kernel/kgdb.c index b16fa9bac5f44..fee81f877525e 100644 --- a/arch/parisc/kernel/kgdb.c +++ b/arch/parisc/kernel/kgdb.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include const struct kgdb_arch arch_kgdb_ops = { diff --git a/arch/parisc/kernel/kprobes.c b/arch/parisc/kernel/kprobes.c index 6e0b86652f30d..9255adba67a36 100644 --- a/arch/parisc/kernel/kprobes.c +++ b/arch/parisc/kernel/kprobes.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); diff --git a/arch/parisc/kernel/patch.c b/arch/parisc/kernel/patch.c index e59574f65e641..35dd764b871e0 100644 --- a/arch/parisc/kernel/patch.c +++ b/arch/parisc/kernel/patch.c @@ -13,7 +13,7 @@ #include #include -#include +#include struct patch { void *addr; diff --git a/arch/powerpc/include/asm/kprobes.h b/arch/powerpc/include/asm/kprobes.h index 4525a9c68260d..dfe2e5ad3b216 100644 --- a/arch/powerpc/include/asm/kprobes.h +++ b/arch/powerpc/include/asm/kprobes.h @@ -21,7 +21,7 @@ #include #include #include -#include +#include #ifdef CONFIG_KPROBES #define __ARCH_WANT_KPROBES_INSN_SLOT diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/text-patching.h similarity index 100% rename from arch/powerpc/include/asm/code-patching.h rename to arch/powerpc/include/asm/text-patching.h diff --git a/arch/powerpc/kernel/crash_dump.c b/arch/powerpc/kernel/crash_dump.c index 2086fa6cdc25b..103b6605dd68f 100644 --- a/arch/powerpc/kernel/crash_dump.c +++ b/arch/powerpc/kernel/crash_dump.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/epapr_paravirt.c b/arch/powerpc/kernel/epapr_paravirt.c index d4b8aff208156..247ab2acaccca 100644 --- a/arch/powerpc/kernel/epapr_paravirt.c +++ b/arch/powerpc/kernel/epapr_paravirt.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/jump_label.c b/arch/powerpc/kernel/jump_label.c index 5277cf582c161..2659e1ac86046 100644 --- a/arch/powerpc/kernel/jump_label.c +++ b/arch/powerpc/kernel/jump_label.c @@ -5,7 +5,7 @@ #include #include -#include +#include #include void arch_jump_label_transform(struct jump_entry *entry, diff --git a/arch/powerpc/kernel/kgdb.c b/arch/powerpc/kernel/kgdb.c index 7a8bc03a00af0..5081334b7bd21 100644 --- a/arch/powerpc/kernel/kgdb.c +++ b/arch/powerpc/kernel/kgdb.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/kernel/kprobes.c b/arch/powerpc/kernel/kprobes.c index f8aa91bc3b175..9c85bbcc5201c 100644 --- a/arch/powerpc/kernel/kprobes.c +++ b/arch/powerpc/kernel/kprobes.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/module_32.c b/arch/powerpc/kernel/module_32.c index 816a63fd71fbf..f930e3395a7f2 100644 --- a/arch/powerpc/kernel/module_32.c +++ b/arch/powerpc/kernel/module_32.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include /* Count how many different relocations (different symbol, different addend) */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index 
e9bab599d0c27..135960918d14c 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/optprobes.c b/arch/powerpc/kernel/optprobes.c index c0b351d61058f..2e83702bf9ba6 100644 --- a/arch/powerpc/kernel/optprobes.c +++ b/arch/powerpc/kernel/optprobes.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index ff61a3e7984ce..7b739b9a91ab9 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -54,7 +54,7 @@ #include #include #endif -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index 4856e1a5161cc..fbb7ebd8aa08b 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -14,7 +14,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/setup_32.c b/arch/powerpc/kernel/setup_32.c index e515c1f7d8d33..75dbf3e0d9c4b 100644 --- a/arch/powerpc/kernel/setup_32.c +++ b/arch/powerpc/kernel/setup_32.c @@ -40,7 +40,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 22f83fbbc762a..3ebf5b9fbe984 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -60,7 +60,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/static_call.c b/arch/powerpc/kernel/static_call.c index 1502b7e439caf..7cfd0710e7579 100644 --- a/arch/powerpc/kernel/static_call.c +++ b/arch/powerpc/kernel/static_call.c @@ -2,7 +2,7 @@ #include #include -#include +#include void arch_static_call_transform(void *site, void *tramp, void *func, bool tail) { diff --git a/arch/powerpc/kernel/trace/ftrace.c b/arch/powerpc/kernel/trace/ftrace.c index d8d6b4fd9a14c..be1a245241b3b 100644 --- a/arch/powerpc/kernel/trace/ftrace.c +++ b/arch/powerpc/kernel/trace/ftrace.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/kernel/trace/ftrace_64_pg.c b/arch/powerpc/kernel/trace/ftrace_64_pg.c index 12fab1803bcf4..9e862ba552639 100644 --- a/arch/powerpc/kernel/trace/ftrace_64_pg.c +++ b/arch/powerpc/kernel/trace/ftrace_64_pg.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index acdab294b340a..af97fbb3c257e 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include static int __patch_mem(void *exec_addr, unsigned long val, void *patch_addr, bool is_dword) diff --git a/arch/powerpc/lib/feature-fixups.c b/arch/powerpc/lib/feature-fixups.c index b7201ba50b2ea..587c8cf1230fb 100644 --- a/arch/powerpc/lib/feature-fixups.c +++ b/arch/powerpc/lib/feature-fixups.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/lib/test-code-patching.c b/arch/powerpc/lib/test-code-patching.c index 8cd3b32f805b0..1440d99630b33 100644 --- a/arch/powerpc/lib/test-code-patching.c +++ b/arch/powerpc/lib/test-code-patching.c @@ -6,7 +6,7 @@ #include #include -#include +#include static int __init instr_is_branch_to_addr(const 
u32 *instr, unsigned long addr) { diff --git a/arch/powerpc/lib/test_emulate_step.c b/arch/powerpc/lib/test_emulate_step.c index 23c7805fb7b3b..66b5b4fa16864 100644 --- a/arch/powerpc/lib/test_emulate_step.c +++ b/arch/powerpc/lib/test_emulate_step.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include #include #define MAX_SUBTESTS 16 diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 2db167f4233f7..6978344edcb4b 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index e1eadd03f1339..47b22282269c1 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/mm/book3s64/slb.c b/arch/powerpc/mm/book3s64/slb.c index f2708c8629a52..6b783552403c6 100644 --- a/arch/powerpc/mm/book3s64/slb.c +++ b/arch/powerpc/mm/book3s64/slb.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include "internal.h" diff --git a/arch/powerpc/mm/kasan/init_32.c b/arch/powerpc/mm/kasan/init_32.c index aa9aa11927b2f..03666d790a535 100644 --- a/arch/powerpc/mm/kasan/init_32.c +++ b/arch/powerpc/mm/kasan/init_32.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include static pgprot_t __init kasan_prot_ro(void) diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 1221c561b43a0..c7708c8fad299 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/mm/nohash/44x.c b/arch/powerpc/mm/nohash/44x.c index 1beae802bb1c0..6d10c6d8be719 100644 --- a/arch/powerpc/mm/nohash/44x.c +++ b/arch/powerpc/mm/nohash/44x.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index ad2a7c26f2a00..062e8785c1bb6 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include diff --git a/arch/powerpc/mm/nohash/tlb.c b/arch/powerpc/mm/nohash/tlb.c index b653a7be4cb1d..0a650742f3a00 100644 --- a/arch/powerpc/mm/nohash/tlb.c +++ b/arch/powerpc/mm/nohash/tlb.c @@ -37,7 +37,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/mm/nohash/tlb_64e.c b/arch/powerpc/mm/nohash/tlb_64e.c index d26656b07b72c..4f925adf26959 100644 --- a/arch/powerpc/mm/nohash/tlb_64e.c +++ b/arch/powerpc/mm/nohash/tlb_64e.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 2a36cc2e7e9e2..68c6a13e6acb1 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -18,7 +18,7 @@ #include #include -#include +#include #include "bpf_jit.h" diff --git a/arch/powerpc/perf/8xx-pmu.c b/arch/powerpc/perf/8xx-pmu.c index 308a2e40d7be9..1d2972229e3a7 100644 --- a/arch/powerpc/perf/8xx-pmu.c +++ b/arch/powerpc/perf/8xx-pmu.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #define PERF_8xx_ID_CPU_CYCLES 1 diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 
42867469752d7..a727cd111cac2 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/85xx/smp.c b/arch/powerpc/platforms/85xx/smp.c index e52b848b64b79..32fa5fb557c03 100644 --- a/arch/powerpc/platforms/85xx/smp.c +++ b/arch/powerpc/platforms/85xx/smp.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/platforms/86xx/mpc86xx_smp.c b/arch/powerpc/platforms/86xx/mpc86xx_smp.c index 8a7e55acf090f..9be33e41af6db 100644 --- a/arch/powerpc/platforms/86xx/mpc86xx_smp.c +++ b/arch/powerpc/platforms/86xx/mpc86xx_smp.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/cell/smp.c b/arch/powerpc/platforms/cell/smp.c index fee638fd89702..0e8f20ecca088 100644 --- a/arch/powerpc/platforms/cell/smp.c +++ b/arch/powerpc/platforms/cell/smp.c @@ -35,7 +35,7 @@ #include #include #include -#include +#include #include "interrupt.h" #include diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index d21b681f52fb0..09e7fe24fac10 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -35,7 +35,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c index ad41dffe4d929..d98b933e4984c 100644 --- a/arch/powerpc/platforms/powernv/idle.c +++ b/arch/powerpc/platforms/powernv/idle.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/powernv/smp.c b/arch/powerpc/platforms/powernv/smp.c index 8f14f0581a21b..6b746feeabe4a 100644 --- a/arch/powerpc/platforms/powernv/smp.c +++ b/arch/powerpc/platforms/powernv/smp.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c index c597711ef20a2..db99725e752bd 100644 --- a/arch/powerpc/platforms/pseries/smp.c +++ b/arch/powerpc/platforms/pseries/smp.c @@ -39,7 +39,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index e6cddbb2305f8..e76e1d5d0611e 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -50,7 +50,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/riscv/errata/andes/errata.c b/arch/riscv/errata/andes/errata.c index fc1a34faa5f3b..dcc9d1ee5ffd3 100644 --- a/arch/riscv/errata/andes/errata.c +++ b/arch/riscv/errata/andes/errata.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/riscv/errata/sifive/errata.c b/arch/riscv/errata/sifive/errata.c index cea3b96ade11a..38aac2c47845a 100644 --- a/arch/riscv/errata/sifive/errata.c +++ b/arch/riscv/errata/sifive/errata.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/riscv/errata/thead/errata.c b/arch/riscv/errata/thead/errata.c index f5120e07c3182..e24770a779323 100644 --- a/arch/riscv/errata/thead/errata.c +++ b/arch/riscv/errata/thead/errata.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include diff --git a/arch/riscv/include/asm/patch.h b/arch/riscv/include/asm/text-patching.h 
similarity index 100% rename from arch/riscv/include/asm/patch.h rename to arch/riscv/include/asm/text-patching.h diff --git a/arch/riscv/include/asm/uprobes.h b/arch/riscv/include/asm/uprobes.h index 3fc7deda91902..5008f76cdc275 100644 --- a/arch/riscv/include/asm/uprobes.h +++ b/arch/riscv/include/asm/uprobes.h @@ -4,7 +4,7 @@ #define _ASM_RISCV_UPROBES_H #include -#include +#include #include #define MAX_UINSN_BYTES 8 diff --git a/arch/riscv/kernel/alternative.c b/arch/riscv/kernel/alternative.c index 0128b161bfdab..7eb3cb1215c62 100644 --- a/arch/riscv/kernel/alternative.c +++ b/arch/riscv/kernel/alternative.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include struct cpu_manufacturer_info_t { unsigned long vendor_id; diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c index 3a8eeaa9310c3..826f46b21f2e8 100644 --- a/arch/riscv/kernel/cpufeature.c +++ b/arch/riscv/kernel/cpufeature.c @@ -20,7 +20,8 @@ #include #include #include -#include +#include +#include #include #include #include diff --git a/arch/riscv/kernel/ftrace.c b/arch/riscv/kernel/ftrace.c index 4b95c574fd045..a7620ef93b6ce 100644 --- a/arch/riscv/kernel/ftrace.c +++ b/arch/riscv/kernel/ftrace.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #ifdef CONFIG_DYNAMIC_FTRACE void ftrace_arch_code_modify_prepare(void) __acquires(&text_mutex) diff --git a/arch/riscv/kernel/jump_label.c b/arch/riscv/kernel/jump_label.c index 11ad789c60c69..6eee6f736f687 100644 --- a/arch/riscv/kernel/jump_label.c +++ b/arch/riscv/kernel/jump_label.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #define RISCV_INSN_NOP 0x00000013U #define RISCV_INSN_JAL 0x0000006fU diff --git a/arch/riscv/kernel/patch.c b/arch/riscv/kernel/patch.c index 34ef522f07a8c..db13c9ddf9e3d 100644 --- a/arch/riscv/kernel/patch.c +++ b/arch/riscv/kernel/patch.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include struct patch_insn { diff --git a/arch/riscv/kernel/probes/kprobes.c b/arch/riscv/kernel/probes/kprobes.c index 474a652136578..380a0e8cecc0b 100644 --- a/arch/riscv/kernel/probes/kprobes.c +++ b/arch/riscv/kernel/probes/kprobes.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include "decode-insn.h" diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c index 4cc631fa70391..ca60db75199d1 100644 --- a/arch/riscv/net/bpf_jit_comp64.c +++ b/arch/riscv/net/bpf_jit_comp64.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include #include #include "bpf_jit.h" diff --git a/arch/riscv/net/bpf_jit_core.c b/arch/riscv/net/bpf_jit_core.c index 6de753c667f42..f8cd2f70a7fb4 100644 --- a/arch/riscv/net/bpf_jit_core.c +++ b/arch/riscv/net/bpf_jit_core.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include "bpf_jit.h" diff --git a/arch/sh/include/asm/Kbuild b/arch/sh/include/asm/Kbuild index fc44d9c88b419..4d3f10ed82758 100644 --- a/arch/sh/include/asm/Kbuild +++ b/arch/sh/include/asm/Kbuild @@ -3,3 +3,4 @@ generated-y += syscall_table.h generic-y += kvm_para.h generic-y += mcs_spinlock.h generic-y += parport.h +generic-y += text-patching.h diff --git a/arch/sparc/include/asm/Kbuild b/arch/sparc/include/asm/Kbuild index 43b0ae4c2c211..17ee8a273aa6b 100644 --- a/arch/sparc/include/asm/Kbuild +++ b/arch/sparc/include/asm/Kbuild @@ -4,3 +4,4 @@ generated-y += syscall_table_64.h generic-y += agp.h generic-y += kvm_para.h generic-y += mcs_spinlock.h +generic-y += text-patching.h diff --git 
a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index 8e594cda6d778..f8de31a0c5d12 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -468,6 +468,11 @@ void *text_poke(void *addr, const void *opcode, size_t len) return memcpy(addr, opcode, len); } +void *text_poke_copy(void *addr, const void *opcode, size_t len) +{ + return text_poke(addr, opcode, len); +} + void text_poke_sync(void) { } diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 6259f1937fe77..ab9e143ec9fea 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -35,6 +35,7 @@ extern void *text_poke(void *addr, const void *opcode, size_t len); extern void text_poke_sync(void); extern void *text_poke_kgdb(void *addr, const void *opcode, size_t len); extern void *text_poke_copy(void *addr, const void *opcode, size_t len); +#define text_poke_copy text_poke_copy extern void *text_poke_copy_locked(void *addr, const void *opcode, size_t len, bool core_ok); extern void *text_poke_set(void *addr, int c, size_t len); extern int poke_int3_handler(struct pt_regs *regs); diff --git a/arch/xtensa/include/asm/Kbuild b/arch/xtensa/include/asm/Kbuild index fa07c686cbcc2..cc5dba738389c 100644 --- a/arch/xtensa/include/asm/Kbuild +++ b/arch/xtensa/include/asm/Kbuild @@ -8,3 +8,4 @@ generic-y += parport.h generic-y += qrwlock.h generic-y += qspinlock.h generic-y += user.h +generic-y += text-patching.h diff --git a/include/asm-generic/text-patching.h b/include/asm-generic/text-patching.h new file mode 100644 index 0000000000000..2245c641b741a --- /dev/null +++ b/include/asm-generic/text-patching.h @@ -0,0 +1,5 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_TEXT_PATCHING_H +#define _ASM_GENERIC_TEXT_PATCHING_H + +#endif /* _ASM_GENERIC_TEXT_PATCHING_H */ diff --git a/include/linux/text-patching.h b/include/linux/text-patching.h new file mode 100644 index 0000000000000..ad5877ab08558 --- /dev/null +++ b/include/linux/text-patching.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_TEXT_PATCHING_H +#define _LINUX_TEXT_PATCHING_H + +#include + +#ifndef text_poke_copy +static inline void *text_poke_copy(void *dst, const void *src, size_t len) +{ + return memcpy(dst, src, len); +} +#define text_poke_copy text_poke_copy +#endif + +#endif /* _LINUX_TEXT_PATCHING_H */ From 0c133b1e78cd34dd9d18da707dc6f46170e9129e Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:07 +0300 Subject: [PATCH 130/215] module: prepare to handle ROX allocations for text In order to support ROX allocations for module text, it is necessary to handle modifications to the code, such as relocations and alternatives patching, without write access to that memory. One option is to use text patching, but this would make module loading extremely slow and will expose executable code that is not finally formed. A better way is to have memory allocated with ROX permissions contain invalid instructions and keep a writable, but not executable copy of the module text. The relocations and alternative patches would be done on the writable copy using the addresses of the ROX memory. Once the module is completely ready, the updated text will be copied to ROX memory using text patching in one go and the writable copy will be freed. Add support for that to module initialization code and provide necessary interfaces in execmem. 
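To make the intended flow easier to follow, here is a minimal sketch of one module_memory region's life cycle built only from the interfaces this patch introduces; the helper names and the simplified error handling are mine, and the authoritative code is module_memory_alloc() and post_relocation() in the diff below:

#include <linux/errno.h>
#include <linux/execmem.h>
#include <linux/module.h>
#include <linux/string.h>
#include <linux/vmalloc.h>

/* Sketch: set up a text region, with a writable scratch copy if it is ROX. */
static int text_region_prepare(struct module_memory *mem,
			       enum execmem_type type, unsigned int size)
{
	mem->base = execmem_alloc(type, size);
	if (!mem->base)
		return -ENOMEM;

	if (execmem_is_rox(type)) {
		/* ROX memory cannot be written directly: relocate in a copy. */
		mem->rw_copy = vzalloc(size);
		if (!mem->rw_copy) {
			execmem_free(mem->base);
			return -ENOMEM;
		}
		mem->is_rox = true;
	} else {
		/* Writable allocation: edit the final location in place. */
		mem->rw_copy = mem->base;
		memset(mem->base, 0, size);
	}
	mem->size = size;
	return 0;
}

/* Sketch: publish the fully formed text via text poking, drop the copy. */
static int text_region_finalize(struct module_memory *mem)
{
	if (!mem->is_rox)
		return 0;
	if (!execmem_update_copy(mem->base, mem->rw_copy, mem->size))
		return -ENOMEM;
	vfree(mem->rw_copy);
	mem->rw_copy = NULL;
	return 0;
}

Between these two steps the loader performs relocations and alternatives patching against mem->rw_copy (looked up via module_writable_address()), while section addresses such as shdr->sh_addr keep pointing at the final ROX location.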
Link: https://lkml.kernel.org/r/20241023162711.2579610-5-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/execmem.h | 23 +++++++++++ include/linux/module.h | 16 ++++++++ include/linux/moduleloader.h | 4 ++ kernel/module/debug_kmemleak.c | 3 +- kernel/module/main.c | 74 ++++++++++++++++++++++++++++++---- kernel/module/strict_rwx.c | 3 ++ mm/execmem.c | 11 +++++ 7 files changed, 126 insertions(+), 8 deletions(-) diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 32cef11441179..dfdf19f8a5e88 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -46,9 +46,11 @@ enum execmem_type { /** * enum execmem_range_flags - options for executable memory allocations * @EXECMEM_KASAN_SHADOW: allocate kasan shadow + * @EXECMEM_ROX_CACHE: allocations should use ROX cache of huge pages */ enum execmem_range_flags { EXECMEM_KASAN_SHADOW = (1 << 0), + EXECMEM_ROX_CACHE = (1 << 1), }; /** @@ -123,6 +125,27 @@ void *execmem_alloc(enum execmem_type type, size_t size); */ void execmem_free(void *ptr); +/** + * execmem_update_copy - copy an update to executable memory + * @dst: destination address to update + * @src: source address containing the data + * @size: how many bytes of memory should be copied + * + * Copy @size bytes from @src to @dst using text poking if the memory at + * @dst is read-only. + * + * Return: a pointer to @dst or NULL on error + */ +void *execmem_update_copy(void *dst, const void *src, size_t size); + +/** + * execmem_is_rox - check if execmem is read-only + * @type: the execmem type to check + * + * Return: %true if the @type is read-only, %false if it's writable + */ +bool execmem_is_rox(enum execmem_type type); + #if defined(CONFIG_EXECMEM) && !defined(CONFIG_ARCH_WANTS_EXECMEM_LATE) void execmem_init(void); #else diff --git a/include/linux/module.h b/include/linux/module.h index 88ecc5e9f5230..2a9386cbdf850 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -367,6 +367,8 @@ enum mod_mem_type { struct module_memory { void *base; + void *rw_copy; + bool is_rox; unsigned int size; #ifdef CONFIG_MODULES_TREE_LOOKUP @@ -767,6 +769,15 @@ static inline bool is_livepatch_module(struct module *mod) void set_module_sig_enforced(void); +void *__module_writable_address(struct module *mod, void *loc); + +static inline void *module_writable_address(struct module *mod, void *loc) +{ + if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX) || !mod) + return loc; + return __module_writable_address(mod, loc); +} + #else /* !CONFIG_MODULES... 
*/ static inline struct module *__module_address(unsigned long addr) @@ -874,6 +885,11 @@ static inline bool module_is_coming(struct module *mod) { return false; } + +static inline void *module_writable_address(struct module *mod, void *loc) +{ + return loc; +} #endif /* CONFIG_MODULES */ #ifdef CONFIG_SYSFS diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index e395461d59e5e..1f5507ba5a128 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -108,6 +108,10 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mod); +int module_post_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *mod); + #ifdef CONFIG_MODULES void flush_module_init_free_work(void); #else diff --git a/kernel/module/debug_kmemleak.c b/kernel/module/debug_kmemleak.c index b4cc03842d703..df873dad049d6 100644 --- a/kernel/module/debug_kmemleak.c +++ b/kernel/module/debug_kmemleak.c @@ -14,7 +14,8 @@ void kmemleak_load_module(const struct module *mod, { /* only scan writable, non-executable sections */ for_each_mod_mem_type(type) { - if (type != MOD_DATA && type != MOD_INIT_DATA) + if (type != MOD_DATA && type != MOD_INIT_DATA && + !mod->mem[type].is_rox) kmemleak_no_scan(mod->mem[type].base); } } diff --git a/kernel/module/main.c b/kernel/module/main.c index 49b9bca9de12f..73b588fe98d44 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1189,6 +1189,18 @@ void __weak module_arch_freeing_init(struct module *mod) { } +void *__module_writable_address(struct module *mod, void *loc) +{ + for_class_mod_mem_type(type, text) { + struct module_memory *mem = &mod->mem[type]; + + if (loc >= mem->base && loc < mem->base + mem->size) + return loc + (mem->rw_copy - mem->base); + } + + return loc; +} + static int module_memory_alloc(struct module *mod, enum mod_mem_type type) { unsigned int size = PAGE_ALIGN(mod->mem[type].size); @@ -1206,6 +1218,23 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) if (!ptr) return -ENOMEM; + mod->mem[type].base = ptr; + + if (execmem_is_rox(execmem_type)) { + ptr = vzalloc(size); + + if (!ptr) { + execmem_free(mod->mem[type].base); + return -ENOMEM; + } + + mod->mem[type].rw_copy = ptr; + mod->mem[type].is_rox = true; + } else { + mod->mem[type].rw_copy = mod->mem[type].base; + memset(mod->mem[type].base, 0, size); + } + /* * The pointer to these blocks of memory are stored on the module * structure and we keep that around so long as the module is @@ -1219,16 +1248,17 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) */ kmemleak_not_leak(ptr); - memset(ptr, 0, size); - mod->mem[type].base = ptr; - return 0; } static void module_memory_free(struct module *mod, enum mod_mem_type type, bool unload_codetags) { - void *ptr = mod->mem[type].base; + struct module_memory *mem = &mod->mem[type]; + void *ptr = mem->base; + + if (mem->is_rox) + vfree(mem->rw_copy); if (!unload_codetags && mod_mem_type_is_core_data(type)) return; @@ -2251,6 +2281,7 @@ static int move_module(struct module *mod, struct load_info *info) for_each_mod_mem_type(type) { if (!mod->mem[type].size) { mod->mem[type].base = NULL; + mod->mem[type].rw_copy = NULL; continue; } @@ -2267,11 +2298,14 @@ static int move_module(struct module *mod, struct load_info *info) void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; + unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; + unsigned long addr; if 
(!(shdr->sh_flags & SHF_ALLOC)) continue; - dest = mod->mem[type].base + (shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK); + addr = (unsigned long)mod->mem[type].base + offset; + dest = mod->mem[type].rw_copy + offset; if (shdr->sh_type != SHT_NOBITS) { /* @@ -2293,7 +2327,7 @@ static int move_module(struct module *mod, struct load_info *info) * users of info can keep taking advantage and using the newly * minted official memory area. */ - shdr->sh_addr = (unsigned long)dest; + shdr->sh_addr = addr; pr_debug("\t0x%lx 0x%.8lx %s\n", (long)shdr->sh_addr, (long)shdr->sh_size, info->secstrings + shdr->sh_name); } @@ -2441,8 +2475,17 @@ int __weak module_finalize(const Elf_Ehdr *hdr, return 0; } +int __weak module_post_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + return 0; +} + static int post_relocation(struct module *mod, const struct load_info *info) { + int ret; + /* Sort exception table now relocations are done. */ sort_extable(mod->extable, mod->extable + mod->num_exentries); @@ -2454,7 +2497,24 @@ static int post_relocation(struct module *mod, const struct load_info *info) add_kallsyms(mod, info); /* Arch-specific module finalizing. */ - return module_finalize(info->hdr, info->sechdrs, mod); + ret = module_finalize(info->hdr, info->sechdrs, mod); + if (ret) + return ret; + + for_each_mod_mem_type(type) { + struct module_memory *mem = &mod->mem[type]; + + if (mem->is_rox) { + if (!execmem_update_copy(mem->base, mem->rw_copy, + mem->size)) + return -ENOMEM; + + vfree(mem->rw_copy); + mem->rw_copy = NULL; + } + } + + return module_post_finalize(info->hdr, info->sechdrs, mod); } /* Call module constructors. */ diff --git a/kernel/module/strict_rwx.c b/kernel/module/strict_rwx.c index c45caa4690e53..239e5013359d9 100644 --- a/kernel/module/strict_rwx.c +++ b/kernel/module/strict_rwx.c @@ -34,6 +34,9 @@ int module_enable_text_rox(const struct module *mod) for_class_mod_mem_type(type, text) { int ret; + if (mod->mem[type].is_rox) + continue; + if (IS_ENABLED(CONFIG_STRICT_MODULE_RWX)) ret = module_set_memory(mod, type, set_memory_rox); else diff --git a/mm/execmem.c b/mm/execmem.c index 0c4b36bc6d10d..0f6691e9ffe6d 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -10,6 +10,7 @@ #include #include #include +#include static struct execmem_info *execmem_info __ro_after_init; static struct execmem_info default_execmem_info __ro_after_init; @@ -69,6 +70,16 @@ void execmem_free(void *ptr) vfree(ptr); } +void *execmem_update_copy(void *dst, const void *src, size_t size) +{ + return text_poke_copy(dst, src, size); +} + +bool execmem_is_rox(enum execmem_type type) +{ + return !!(execmem_info->ranges[type].flags & EXECMEM_ROX_CACHE); +} + static bool execmem_validate(struct execmem_info *info) { struct execmem_range *r = &info->ranges[EXECMEM_DEFAULT]; From 0c6378a71574daa6cd1534ad42a956e3262756c7 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:08 +0300 Subject: [PATCH 131/215] arch: introduce set_direct_map_valid_noflush() Add an API that will allow updates of the direct/linear map for a set of physically contiguous pages. It will be used in the following patches. 
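For illustration only (not part of the patch), a sketch of the intended call pattern for the new API, assuming a physically contiguous range of nr pages; example_drop_direct_map_alias() is a hypothetical name, and TLB maintenance stays with the caller because of the _noflush semantics:

#include <linux/mm.h>
#include <linux/set_memory.h>
#include <asm/tlbflush.h>

/*
 * Remove the direct-map alias of nr physically contiguous pages with a
 * single call instead of looping over set_direct_map_invalid_noflush(),
 * then flush the TLB explicitly.
 */
static int example_drop_direct_map_alias(struct page *page, unsigned int nr)
{
	unsigned long start = (unsigned long)page_address(page);
	int err = set_direct_map_valid_noflush(page, nr, false);

	if (err)
		return err;

	flush_tlb_kernel_range(start, start + nr * PAGE_SIZE);
	return 0;
}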
Link: https://lkml.kernel.org/r/20241023162711.2579610-6-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Christoph Hellwig Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/include/asm/set_memory.h | 1 + arch/arm64/mm/pageattr.c | 10 ++++++++++ arch/loongarch/include/asm/set_memory.h | 1 + arch/loongarch/mm/pageattr.c | 19 +++++++++++++++++++ arch/riscv/include/asm/set_memory.h | 1 + arch/riscv/mm/pageattr.c | 15 +++++++++++++++ arch/s390/include/asm/set_memory.h | 1 + arch/s390/mm/pageattr.c | 11 +++++++++++ arch/x86/include/asm/set_memory.h | 1 + arch/x86/mm/pat/set_memory.c | 8 ++++++++ include/linux/set_memory.h | 6 ++++++ 11 files changed, 74 insertions(+) diff --git a/arch/arm64/include/asm/set_memory.h b/arch/arm64/include/asm/set_memory.h index 917761feeffdd..98088c043606a 100644 --- a/arch/arm64/include/asm/set_memory.h +++ b/arch/arm64/include/asm/set_memory.h @@ -13,6 +13,7 @@ int set_memory_valid(unsigned long addr, int numpages, int enable); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); bool kernel_page_present(struct page *page); #endif /* _ASM_ARM64_SET_MEMORY_H */ diff --git a/arch/arm64/mm/pageattr.c b/arch/arm64/mm/pageattr.c index 0e270a1c51e64..01225900293ac 100644 --- a/arch/arm64/mm/pageattr.c +++ b/arch/arm64/mm/pageattr.c @@ -192,6 +192,16 @@ int set_direct_map_default_noflush(struct page *page) PAGE_SIZE, change_page_range, &data); } +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long addr = (unsigned long)page_address(page); + + if (!can_set_direct_map()) + return 0; + + return set_memory_valid(addr, nr, valid); +} + #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { diff --git a/arch/loongarch/include/asm/set_memory.h b/arch/loongarch/include/asm/set_memory.h index d70505b6676cb..55dfaefd02c8a 100644 --- a/arch/loongarch/include/asm/set_memory.h +++ b/arch/loongarch/include/asm/set_memory.h @@ -17,5 +17,6 @@ int set_memory_rw(unsigned long addr, int numpages); bool kernel_page_present(struct page *page); int set_direct_map_default_noflush(struct page *page); int set_direct_map_invalid_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); #endif /* _ASM_LOONGARCH_SET_MEMORY_H */ diff --git a/arch/loongarch/mm/pageattr.c b/arch/loongarch/mm/pageattr.c index ffd8d76021d47..bf86782484440 100644 --- a/arch/loongarch/mm/pageattr.c +++ b/arch/loongarch/mm/pageattr.c @@ -216,3 +216,22 @@ int set_direct_map_invalid_noflush(struct page *page) return __set_memory(addr, 1, 
__pgprot(0), __pgprot(_PAGE_PRESENT | _PAGE_VALID)); } + +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long addr = (unsigned long)page_address(page); + pgprot_t set, clear; + + if (addr < vm_map_base) + return 0; + + if (valid) { + set = PAGE_KERNEL; + clear = __pgprot(0); + } else { + set = __pgprot(0); + clear = __pgprot(_PAGE_PRESENT | _PAGE_VALID); + } + + return __set_memory(addr, 1, set, clear); +} diff --git a/arch/riscv/include/asm/set_memory.h b/arch/riscv/include/asm/set_memory.h index ab92fc84e1fc9..ea263d3683ef6 100644 --- a/arch/riscv/include/asm/set_memory.h +++ b/arch/riscv/include/asm/set_memory.h @@ -42,6 +42,7 @@ static inline int set_kernel_memory(char *startp, char *endp, int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); bool kernel_page_present(struct page *page); #endif /* __ASSEMBLY__ */ diff --git a/arch/riscv/mm/pageattr.c b/arch/riscv/mm/pageattr.c index 271d01a5ba4da..d815448758a19 100644 --- a/arch/riscv/mm/pageattr.c +++ b/arch/riscv/mm/pageattr.c @@ -386,6 +386,21 @@ int set_direct_map_default_noflush(struct page *page) PAGE_KERNEL, __pgprot(_PAGE_EXEC)); } +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + pgprot_t set, clear; + + if (valid) { + set = PAGE_KERNEL; + clear = __pgprot(_PAGE_EXEC); + } else { + set = __pgprot(0); + clear = __pgprot(_PAGE_PRESENT); + } + + return __set_memory((unsigned long)page_address(page), nr, set, clear); +} + #ifdef CONFIG_DEBUG_PAGEALLOC static int debug_pagealloc_set_page(pte_t *pte, unsigned long addr, void *data) { diff --git a/arch/s390/include/asm/set_memory.h b/arch/s390/include/asm/set_memory.h index 06fbabe2f66c9..240bcfbdcdcec 100644 --- a/arch/s390/include/asm/set_memory.h +++ b/arch/s390/include/asm/set_memory.h @@ -62,5 +62,6 @@ __SET_MEMORY_FUNC(set_memory_4k, SET_MEMORY_4K) int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); #endif diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index 5f805ad42d4c3..4c7ee74aa130d 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -406,6 +406,17 @@ int set_direct_map_default_noflush(struct page *page) return __set_memory((unsigned long)page_to_virt(page), 1, SET_MEMORY_DEF); } +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + unsigned long flags; + + if (valid) + flags = SET_MEMORY_DEF; + else + flags = SET_MEMORY_INV; + + return __set_memory((unsigned long)page_to_virt(page), nr, flags); +} #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) static void ipte_range(pte_t *pte, unsigned long address, int nr) diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 4b2abce2e3e7d..cc62ef70ccc0a 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -89,6 +89,7 @@ int set_pages_rw(struct page *page, int numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid); bool kernel_page_present(struct page *page); extern int kernel_set_to_readonly; diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 44f7b2ea6a073..069e421c22474 100644 
--- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2444,6 +2444,14 @@ int set_direct_map_default_noflush(struct page *page) return __set_pages_p(page, 1); } +int set_direct_map_valid_noflush(struct page *page, unsigned nr, bool valid) +{ + if (valid) + return __set_pages_p(page, nr); + + return __set_pages_np(page, nr); +} + #ifdef CONFIG_DEBUG_PAGEALLOC void __kernel_map_pages(struct page *page, int numpages, int enable) { diff --git a/include/linux/set_memory.h b/include/linux/set_memory.h index e7aec20fb44f1..3030d9245f5ac 100644 --- a/include/linux/set_memory.h +++ b/include/linux/set_memory.h @@ -34,6 +34,12 @@ static inline int set_direct_map_default_noflush(struct page *page) return 0; } +static inline int set_direct_map_valid_noflush(struct page *page, + unsigned nr, bool valid) +{ + return 0; +} + static inline bool kernel_page_present(struct page *page) { return true; From 9bfc4824fd4836c16bb44f922bfaffba5da3e4f3 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:09 +0300 Subject: [PATCH 132/215] x86/module: prepare module loading for ROX allocations of text When module text memory will be allocated with ROX permissions, the memory at the actual address where the module will live will contain invalid instructions and there will be a writable copy that contains the actual module code. Update relocations and alternatives patching to deal with it. [rppt@kernel.org: fix writable address in cfi_rewrite_endbr()] Link: https://lkml.kernel.org/r/ZysRwR29Ji8CcbXc@kernel.org Link: https://lkml.kernel.org/r/20241023162711.2579610-7-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Tested-by: kdevops Tested-by: Nathan Chancellor Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. 
Howlett Cc: Luis Chamberlain Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/um/kernel/um_arch.c | 11 +- arch/x86/entry/vdso/vma.c | 3 +- arch/x86/include/asm/alternative.h | 14 +-- arch/x86/kernel/alternative.c | 181 +++++++++++++++++------------ arch/x86/kernel/ftrace.c | 30 ++--- arch/x86/kernel/module.c | 45 ++++--- 6 files changed, 167 insertions(+), 117 deletions(-) diff --git a/arch/um/kernel/um_arch.c b/arch/um/kernel/um_arch.c index f8de31a0c5d12..e8e8b54b3037d 100644 --- a/arch/um/kernel/um_arch.c +++ b/arch/um/kernel/um_arch.c @@ -435,24 +435,25 @@ void __init arch_cpu_finalize_init(void) os_check_bugs(); } -void apply_seal_endbr(s32 *start, s32 *end) +void apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } -void apply_retpolines(s32 *start, s32 *end) +void apply_retpolines(s32 *start, s32 *end, struct module *mod) { } -void apply_returns(s32 *start, s32 *end) +void apply_returns(s32 *start, s32 *end, struct module *mod) { } void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { } -void apply_alternatives(struct alt_instr *start, struct alt_instr *end) +void apply_alternatives(struct alt_instr *start, struct alt_instr *end, + struct module *mod) { } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index b8fed8b8b9ccd..ed21151923c30 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -54,7 +54,8 @@ int __init init_vdso_image(const struct vdso_image *image) apply_alternatives((struct alt_instr *)(image->data + image->alt), (struct alt_instr *)(image->data + image->alt + - image->alt_len)); + image->alt_len), + NULL); return 0; } diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index ca9ae606aab9a..dc03a647776d9 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -96,16 +96,16 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; * instructions were patched in already: */ extern int alternatives_patched; +struct module; extern void alternative_instructions(void); -extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); -extern void apply_retpolines(s32 *start, s32 *end); -extern void apply_returns(s32 *start, s32 *end); -extern void apply_seal_endbr(s32 *start, s32 *end); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end, + struct module *mod); +extern void apply_retpolines(s32 *start, s32 *end, struct module *mod); +extern void apply_returns(s32 *start, s32 *end, struct module *mod); +extern void apply_seal_endbr(s32 *start, s32 *end, struct module *mod); extern void apply_fineibt(s32 *start_retpoline, s32 *end_retpoine, - s32 *start_cfi, s32 *end_cfi); - -struct module; + s32 *start_cfi, s32 *end_cfi, struct module *mod); struct callthunk_sites { s32 *call_start, *call_end; diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d17518ca19b8b..243843e44e89d 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -392,8 +392,10 @@ EXPORT_SYMBOL(BUG_func); * 
Rewrite the "call BUG_func" replacement to point to the target of the * indirect pv_ops call "call *disp(%ip)". */ -static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) +static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a, + struct module *mod) { + u8 *wr_instr = module_writable_address(mod, instr); void *target, *bug = &BUG_func; s32 disp; @@ -403,14 +405,14 @@ static int alt_replace_call(u8 *instr, u8 *insn_buff, struct alt_instr *a) } if (a->instrlen != 6 || - instr[0] != CALL_RIP_REL_OPCODE || - instr[1] != CALL_RIP_REL_MODRM) { + wr_instr[0] != CALL_RIP_REL_OPCODE || + wr_instr[1] != CALL_RIP_REL_MODRM) { pr_err("ALT_FLAG_DIRECT_CALL set for unrecognized indirect call\n"); BUG(); } /* Skip CALL_RIP_REL_OPCODE and CALL_RIP_REL_MODRM */ - disp = *(s32 *)(instr + 2); + disp = *(s32 *)(wr_instr + 2); #ifdef CONFIG_X86_64 /* ff 15 00 00 00 00 call *0x0(%rip) */ /* target address is stored at "next instruction + disp". */ @@ -448,7 +450,8 @@ static inline u8 * instr_va(struct alt_instr *i) * to refetch changed I$ lines. */ void __init_or_module noinline apply_alternatives(struct alt_instr *start, - struct alt_instr *end) + struct alt_instr *end, + struct module *mod) { u8 insn_buff[MAX_PATCH_LEN]; u8 *instr, *replacement; @@ -477,6 +480,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, */ for (a = start; a < end; a++) { int insn_buff_sz = 0; + u8 *wr_instr, *wr_replacement; /* * In case of nested ALTERNATIVE()s the outer alternative might @@ -490,7 +494,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, } instr = instr_va(a); + wr_instr = module_writable_address(mod, instr); + replacement = (u8 *)&a->repl_offset + a->repl_offset; + wr_replacement = module_writable_address(mod, replacement); + BUG_ON(a->instrlen > sizeof(insn_buff)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); @@ -501,9 +509,9 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, * patch if feature is *NOT* present. 
*/ if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT)) { - memcpy(insn_buff, instr, a->instrlen); + memcpy(insn_buff, wr_instr, a->instrlen); optimize_nops(instr, insn_buff, a->instrlen); - text_poke_early(instr, insn_buff, a->instrlen); + text_poke_early(wr_instr, insn_buff, a->instrlen); continue; } @@ -513,11 +521,12 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, instr, instr, a->instrlen, replacement, a->replacementlen, a->flags); - memcpy(insn_buff, replacement, a->replacementlen); + memcpy(insn_buff, wr_replacement, a->replacementlen); insn_buff_sz = a->replacementlen; if (a->flags & ALT_FLAG_DIRECT_CALL) { - insn_buff_sz = alt_replace_call(instr, insn_buff, a); + insn_buff_sz = alt_replace_call(instr, insn_buff, a, + mod); if (insn_buff_sz < 0) continue; } @@ -527,11 +536,11 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); - DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); + DUMP_BYTES(ALT, wr_instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); DUMP_BYTES(ALT, insn_buff, insn_buff_sz, "%px: final_insn: ", instr); - text_poke_early(instr, insn_buff, insn_buff_sz); + text_poke_early(wr_instr, insn_buff, insn_buff_sz); } kasan_enable_current(); @@ -722,18 +731,20 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) /* * Generated by 'objtool --retpoline'. */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, + struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op1, op2; - ret = insn_decode_kernel(&insn, addr); + ret = insn_decode_kernel(&insn, wr_addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -761,9 +772,9 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) len = patch_retpoline(addr, &insn, bytes); if (len == insn.length) { optimize_nops(addr, bytes, len); - DUMP_BYTES(RETPOLINE, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RETPOLINE, ((u8*)wr_addr), len, "%px: orig: ", addr); DUMP_BYTES(RETPOLINE, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(addr, bytes, len); + text_poke_early(wr_addr, bytes, len); } } } @@ -799,7 +810,8 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) return i; } -void __init_or_module noinline apply_returns(s32 *start, s32 *end) +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { s32 *s; @@ -808,12 +820,13 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); struct insn insn; int len, ret; u8 bytes[16]; u8 op; - ret = insn_decode_kernel(&insn, addr); + ret = insn_decode_kernel(&insn, wr_addr); if (WARN_ON_ONCE(ret < 0)) continue; @@ -833,32 +846,35 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) len = patch_return(addr, &insn, bytes); if (len == insn.length) { - DUMP_BYTES(RET, ((u8*)addr), len, "%px: orig: ", addr); + DUMP_BYTES(RET, ((u8*)wr_addr), len, "%px: orig: ", addr); DUMP_BYTES(RET, ((u8*)bytes), len, "%px: repl: ", addr); - text_poke_early(addr, bytes, len); + text_poke_early(wr_addr, bytes, len); } } } 
#else -void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { } #endif /* CONFIG_MITIGATION_RETHUNK */ #else /* !CONFIG_MITIGATION_RETPOLINE || !CONFIG_OBJTOOL */ -void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { } -void __init_or_module noinline apply_returns(s32 *start, s32 *end) { } +void __init_or_module noinline apply_retpolines(s32 *start, s32 *end, + struct module *mod) { } +void __init_or_module noinline apply_returns(s32 *start, s32 *end, + struct module *mod) { } #endif /* CONFIG_MITIGATION_RETPOLINE && CONFIG_OBJTOOL */ #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr); +static void poison_cfi(void *addr, void *wr_addr); -static void __init_or_module poison_endbr(void *addr, bool warn) +static void __init_or_module poison_endbr(void *addr, void *wr_addr, bool warn) { u32 endbr, poison = gen_endbr_poison(); - if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr))) + if (WARN_ON_ONCE(get_kernel_nofault(endbr, wr_addr))) return; if (!is_endbr(endbr)) { @@ -873,7 +889,7 @@ static void __init_or_module poison_endbr(void *addr, bool warn) */ DUMP_BYTES(ENDBR, ((u8*)addr), 4, "%px: orig: ", addr); DUMP_BYTES(ENDBR, ((u8*)&poison), 4, "%px: repl: ", addr); - text_poke_early(addr, &poison, 4); + text_poke_early(wr_addr, &poison, 4); } /* @@ -882,22 +898,23 @@ static void __init_or_module poison_endbr(void *addr, bool warn) * Seal the functions for indirect calls by clobbering the ENDBR instructions * and the kCFI hash value. */ -void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end) +void __init_or_module noinline apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr, true); + poison_endbr(addr, wr_addr, true); if (IS_ENABLED(CONFIG_FINEIBT)) - poison_cfi(addr - 16); + poison_cfi(addr - 16, wr_addr - 16); } } #else -void __init_or_module apply_seal_endbr(s32 *start, s32 *end) { } +void __init_or_module apply_seal_endbr(s32 *start, s32 *end, struct module *mod) { } #endif /* CONFIG_X86_KERNEL_IBT */ @@ -1119,7 +1136,7 @@ static u32 decode_caller_hash(void *addr) } /* .retpoline_sites */ -static int cfi_disable_callers(s32 *start, s32 *end) +static int cfi_disable_callers(s32 *start, s32 *end, struct module *mod) { /* * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate @@ -1131,20 +1148,23 @@ static int cfi_disable_callers(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); + if (!hash) /* nocfi callers */ continue; - text_poke_early(addr, jmp, 2); + text_poke_early(wr_addr, jmp, 2); } return 0; } -static int cfi_enable_callers(s32 *start, s32 *end) +static int cfi_enable_callers(s32 *start, s32 *end, struct module *mod) { /* * Re-enable kCFI, undo what cfi_disable_callers() did. 
@@ -1154,106 +1174,115 @@ static int cfi_enable_callers(s32 *start, s32 *end) for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (!hash) /* nocfi callers */ continue; - text_poke_early(addr, mov, 2); + text_poke_early(wr_addr, mov, 2); } return 0; } /* .cfi_sites */ -static int cfi_rand_preamble(s32 *start, s32 *end) +static int cfi_rand_preamble(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(addr); + hash = decode_preamble_hash(wr_addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; hash = cfi_rehash(hash); - text_poke_early(addr + 1, &hash, 4); + text_poke_early(wr_addr + 1, &hash, 4); } return 0; } -static int cfi_rewrite_preamble(s32 *start, s32 *end) +static int cfi_rewrite_preamble(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); u32 hash; - hash = decode_preamble_hash(addr); + hash = decode_preamble_hash(wr_addr); if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n", addr, addr, 5, addr)) return -EINVAL; - text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size); - WARN_ON(*(u32 *)(addr + fineibt_preamble_hash) != 0x12345678); - text_poke_early(addr + fineibt_preamble_hash, &hash, 4); + text_poke_early(wr_addr, fineibt_preamble_start, fineibt_preamble_size); + WARN_ON(*(u32 *)(wr_addr + fineibt_preamble_hash) != 0x12345678); + text_poke_early(wr_addr + fineibt_preamble_hash, &hash, 4); } return 0; } -static void cfi_rewrite_endbr(s32 *start, s32 *end) +static void cfi_rewrite_endbr(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr = module_writable_address(mod, addr); - poison_endbr(addr+16, false); + poison_endbr(addr + 16, wr_addr + 16, false); } } /* .retpoline_sites */ -static int cfi_rand_callers(s32 *start, s32 *end) +static int cfi_rand_callers(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (hash) { hash = -cfi_rehash(hash); - text_poke_early(addr + 2, &hash, 4); + text_poke_early(wr_addr + 2, &hash, 4); } } return 0; } -static int cfi_rewrite_callers(s32 *start, s32 *end) +static int cfi_rewrite_callers(s32 *start, s32 *end, struct module *mod) { s32 *s; for (s = start; s < end; s++) { void *addr = (void *)s + *s; + void *wr_addr; u32 hash; addr -= fineibt_caller_size; - hash = decode_caller_hash(addr); + wr_addr = module_writable_address(mod, addr); + hash = decode_caller_hash(wr_addr); if (hash) { - text_poke_early(addr, fineibt_caller_start, fineibt_caller_size); - WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678); - text_poke_early(addr + fineibt_caller_hash, &hash, 4); + text_poke_early(wr_addr, fineibt_caller_start, fineibt_caller_size); + WARN_ON(*(u32 *)(wr_addr + fineibt_caller_hash) != 0x12345678); + text_poke_early(wr_addr + fineibt_caller_hash, &hash, 4); } /* rely on apply_retpolines() */ } @@ -1262,8 +1291,9 @@ 
static int cfi_rewrite_callers(s32 *start, s32 *end) } static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, bool builtin) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { + bool builtin = mod ? false : true; int ret; if (WARN_ONCE(fineibt_preamble_size != 16, @@ -1281,7 +1311,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, * rewrite them. This disables all CFI. If this succeeds but any of the * later stages fails, we're without CFI. */ - ret = cfi_disable_callers(start_retpoline, end_retpoline); + ret = cfi_disable_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; @@ -1292,11 +1322,11 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, cfi_bpf_subprog_hash = cfi_rehash(cfi_bpf_subprog_hash); } - ret = cfi_rand_preamble(start_cfi, end_cfi); + ret = cfi_rand_preamble(start_cfi, end_cfi, mod); if (ret) goto err; - ret = cfi_rand_callers(start_retpoline, end_retpoline); + ret = cfi_rand_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; } @@ -1308,7 +1338,7 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, return; case CFI_KCFI: - ret = cfi_enable_callers(start_retpoline, end_retpoline); + ret = cfi_enable_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; @@ -1318,17 +1348,17 @@ static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, case CFI_FINEIBT: /* place the FineIBT preamble at func()-16 */ - ret = cfi_rewrite_preamble(start_cfi, end_cfi); + ret = cfi_rewrite_preamble(start_cfi, end_cfi, mod); if (ret) goto err; /* rewrite the callers to target func()-16 */ - ret = cfi_rewrite_callers(start_retpoline, end_retpoline); + ret = cfi_rewrite_callers(start_retpoline, end_retpoline, mod); if (ret) goto err; /* now that nobody targets func()+0, remove ENDBR there */ - cfi_rewrite_endbr(start_cfi, end_cfi); + cfi_rewrite_endbr(start_cfi, end_cfi, mod); if (builtin) pr_info("Using FineIBT CFI\n"); @@ -1347,7 +1377,7 @@ static inline void poison_hash(void *addr) *(u32 *)addr = 0; } -static void poison_cfi(void *addr) +static void poison_cfi(void *addr, void *wr_addr) { switch (cfi_mode) { case CFI_FINEIBT: @@ -1359,8 +1389,8 @@ static void poison_cfi(void *addr) * ud2 * 1: nop */ - poison_endbr(addr, false); - poison_hash(addr + fineibt_preamble_hash); + poison_endbr(addr, wr_addr, false); + poison_hash(wr_addr + fineibt_preamble_hash); break; case CFI_KCFI: @@ -1369,7 +1399,7 @@ static void poison_cfi(void *addr) * movl $0, %eax * .skip 11, 0x90 */ - poison_hash(addr + 1); + poison_hash(wr_addr + 1); break; default: @@ -1380,22 +1410,21 @@ static void poison_cfi(void *addr) #else static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi, bool builtin) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { } #ifdef CONFIG_X86_KERNEL_IBT -static void poison_cfi(void *addr) { } +static void poison_cfi(void *addr, void *wr_addr) { } #endif #endif void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline, - s32 *start_cfi, s32 *end_cfi) + s32 *start_cfi, s32 *end_cfi, struct module *mod) { return __apply_fineibt(start_retpoline, end_retpoline, - start_cfi, end_cfi, - /* .builtin = */ false); + start_cfi, end_cfi, mod); } #ifdef CONFIG_SMP @@ -1692,16 +1721,16 @@ void __init alternative_instructions(void) paravirt_set_cap(); __apply_fineibt(__retpoline_sites, __retpoline_sites_end, - __cfi_sites, __cfi_sites_end, true); + __cfi_sites, __cfi_sites_end, NULL); /* * Rewrite the 
retpolines, must be done before alternatives since * those can rewrite the retpoline thunks. */ - apply_retpolines(__retpoline_sites, __retpoline_sites_end); - apply_returns(__return_sites, __return_sites_end); + apply_retpolines(__retpoline_sites, __retpoline_sites_end, NULL); + apply_returns(__return_sites, __return_sites_end, NULL); - apply_alternatives(__alt_instructions, __alt_instructions_end); + apply_alternatives(__alt_instructions, __alt_instructions_end, NULL); /* * Now all calls are established. Apply the call thunks if @@ -1712,7 +1741,7 @@ void __init alternative_instructions(void) /* * Seal all functions that do not have their address taken. */ - apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + apply_seal_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end, NULL); #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 8da0e66ca22de..b498897b213cc 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -118,10 +118,13 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, return ret; /* replace the text with the new text */ - if (ftrace_poke_late) + if (ftrace_poke_late) { text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); - else - text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); + } else { + mutex_lock(&text_mutex); + text_poke((void *)ip, new_code, MCOUNT_INSN_SIZE); + mutex_unlock(&text_mutex); + } return 0; } @@ -318,7 +321,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) unsigned const char op_ref[] = { 0x48, 0x8b, 0x15 }; unsigned const char retq[] = { RET_INSN_OPCODE, INT3_INSN_OPCODE }; union ftrace_op_code_union op_ptr; - int ret; + void *ret; if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { start_offset = (unsigned long)ftrace_regs_caller; @@ -349,15 +352,15 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) npages = DIV_ROUND_UP(*tramp_size, PAGE_SIZE); /* Copy ftrace_caller onto the trampoline memory */ - ret = copy_from_kernel_nofault(trampoline, (void *)start_offset, size); - if (WARN_ON(ret < 0)) + ret = text_poke_copy(trampoline, (void *)start_offset, size); + if (WARN_ON(!ret)) goto fail; ip = trampoline + size; if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else - memcpy(ip, retq, sizeof(retq)); + text_poke_copy(ip, retq, sizeof(retq)); /* No need to test direct calls on created trampolines */ if (ops->flags & FTRACE_OPS_FL_SAVE_REGS) { @@ -365,8 +368,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) ip = trampoline + (jmp_offset - start_offset); if (WARN_ON(*(char *)ip != 0x75)) goto fail; - ret = copy_from_kernel_nofault(ip, x86_nops[2], 2); - if (ret < 0) + if (!text_poke_copy(ip, x86_nops[2], 2)) goto fail; } @@ -379,7 +381,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) */ ptr = (unsigned long *)(trampoline + size + RET_SIZE); - *ptr = (unsigned long)ops; + text_poke_copy(ptr, &ops, sizeof(unsigned long)); op_offset -= start_offset; memcpy(&op_ptr, trampoline + op_offset, OP_REF_SIZE); @@ -395,7 +397,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) op_ptr.offset = offset; /* put in the new offset to the ftrace_ops */ - memcpy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); + text_poke_copy(trampoline + op_offset, &op_ptr, OP_REF_SIZE); /* put in the call to the function */ mutex_lock(&text_mutex); @@ -405,9 +407,9 @@ 
create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) * the depth accounting before the call already. */ dest = ftrace_ops_get_func(ops); - memcpy(trampoline + call_offset, - text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), - CALL_INSN_SIZE); + text_poke_copy_locked(trampoline + call_offset, + text_gen_insn(CALL_INSN_OPCODE, trampoline + call_offset, dest), + CALL_INSN_SIZE, false); mutex_unlock(&text_mutex); /* ALLOC_TRAMP flags lets us know we created it */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 837450b6e882f..8984abd91c001 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -146,18 +146,21 @@ static int __write_relocate_add(Elf64_Shdr *sechdrs, } if (apply) { - if (memcmp(loc, &zero, size)) { + void *wr_loc = module_writable_address(me, loc); + + if (memcmp(wr_loc, &zero, size)) { pr_err("x86/modules: Invalid relocation target, existing value is nonzero for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } - write(loc, &val, size); + write(wr_loc, &val, size); } else { if (memcmp(loc, &val, size)) { pr_warn("x86/modules: Invalid relocation target, existing value does not match expected value for type %d, loc %p, val %Lx\n", (int)ELF64_R_TYPE(rel[i].r_info), loc, val); return -ENOEXEC; } + /* FIXME: needs care for ROX module allocations */ write(loc, &zero, size); } } @@ -224,7 +227,7 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *me) { - const Elf_Shdr *s, *alt = NULL, *locks = NULL, + const Elf_Shdr *s, *alt = NULL, *orc = NULL, *orc_ip = NULL, *retpolines = NULL, *returns = NULL, *ibt_endbr = NULL, *calls = NULL, *cfi = NULL; @@ -233,8 +236,6 @@ int module_finalize(const Elf_Ehdr *hdr, for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { if (!strcmp(".altinstructions", secstrings + s->sh_name)) alt = s; - if (!strcmp(".smp_locks", secstrings + s->sh_name)) - locks = s; if (!strcmp(".orc_unwind", secstrings + s->sh_name)) orc = s; if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) @@ -265,20 +266,20 @@ int module_finalize(const Elf_Ehdr *hdr, csize = cfi->sh_size; } - apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize); + apply_fineibt(rseg, rseg + rsize, cseg, cseg + csize, me); } if (retpolines) { void *rseg = (void *)retpolines->sh_addr; - apply_retpolines(rseg, rseg + retpolines->sh_size); + apply_retpolines(rseg, rseg + retpolines->sh_size, me); } if (returns) { void *rseg = (void *)returns->sh_addr; - apply_returns(rseg, rseg + returns->sh_size); + apply_returns(rseg, rseg + returns->sh_size, me); } if (alt) { /* patch .altinstructions */ void *aseg = (void *)alt->sh_addr; - apply_alternatives(aseg, aseg + alt->sh_size); + apply_alternatives(aseg, aseg + alt->sh_size, me); } if (calls || alt) { struct callthunk_sites cs = {}; @@ -297,8 +298,28 @@ int module_finalize(const Elf_Ehdr *hdr, } if (ibt_endbr) { void *iseg = (void *)ibt_endbr->sh_addr; - apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size); + apply_seal_endbr(iseg, iseg + ibt_endbr->sh_size, me); } + + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + + return 0; +} + +int module_post_finalize(const Elf_Ehdr *hdr, + const Elf_Shdr *sechdrs, + struct module *me) +{ + const Elf_Shdr *s, *locks = NULL; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".smp_locks", secstrings + s->sh_name)) + 
locks = s; + } + if (locks) { void *lseg = (void *)locks->sh_addr; void *text = me->mem[MOD_TEXT].base; @@ -308,10 +329,6 @@ int module_finalize(const Elf_Ehdr *hdr, text, text_end); } - if (orc && orc_ip) - unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, - (void *)orc->sh_addr, orc->sh_size); - return 0; } From 2e45474ab14f0f17c1091c503a13ff2fe2a84486 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:10 +0300 Subject: [PATCH 133/215] execmem: add support for cache of large ROX pages Using large pages to map text areas reduces iTLB pressure and improves performance. Extend execmem_alloc() with the ability to use huge pages with ROX permissions as a cache for smaller allocations. To populate the cache, a writable large page is allocated from vmalloc with VM_ALLOW_HUGE_VMAP, filled with invalid instructions and then remapped as ROX. The direct map alias of that large page is excluded from the direct map. Portions of that large page are handed out to execmem_alloc() callers without any changes to the permissions. When the memory is freed with execmem_free(), it is invalidated again so that it won't contain stale instructions. An architecture has to implement the execmem_fill_trapping_insns() callback and select the ARCH_HAS_EXECMEM_ROX configuration option to be able to use the ROX cache. The cache is enabled on a per-range basis when an architecture sets the EXECMEM_ROX_CACHE flag in the definition of an execmem_range. Link: https://lkml.kernel.org/r/20241023162711.2579610-8-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/Kconfig | 8 + include/linux/execmem.h | 14 ++ mm/execmem.c | 325 +++++++++++++++++++++++++++++++++++++++- mm/internal.h | 1 + mm/vmalloc.c | 5 + 5 files changed, 345 insertions(+), 8 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index bd9f095d69fa0..89b14e4edc61d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1024,6 +1024,14 @@ config ARCH_WANTS_EXECMEM_LATE enough entropy for module space randomization, for instance arm64. +config ARCH_HAS_EXECMEM_ROX + bool + depends on MMU && !HIGHMEM + help + For architectures that support allocations of executable memory + with read-only execute permissions. Architecture must implement + execmem_fill_trapping_insns() callback to enable this.
+ config HAVE_IRQ_EXIT_ON_IRQ_STACK bool help diff --git a/include/linux/execmem.h b/include/linux/execmem.h index dfdf19f8a5e88..1517fa196bf73 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -53,6 +53,20 @@ enum execmem_range_flags { EXECMEM_ROX_CACHE = (1 << 1), }; +#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX +/** + * execmem_fill_trapping_insns - set memory to contain instructions that + * will trap + * @ptr: pointer to memory to fill + * @size: size of the range to fill + * @writable: is the memory poited by @ptr is writable or ROX + * + * A hook for architecures to fill execmem ranges with invalid instructions. + * Architectures that use EXECMEM_ROX_CACHE must implement this. + */ +void execmem_fill_trapping_insns(void *ptr, size_t size, bool writable); +#endif + /** * struct execmem_range - definition of an address space suitable for code and * related data allocations diff --git a/mm/execmem.c b/mm/execmem.c index 0f6691e9ffe6d..576a57e2161f9 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -6,29 +6,41 @@ * Copyright (C) 2024 Mike Rapoport IBM. */ +#define pr_fmt(fmt) "execmem: " fmt + #include +#include #include #include +#include +#include #include #include +#include + +#include "internal.h" + static struct execmem_info *execmem_info __ro_after_init; static struct execmem_info default_execmem_info __ro_after_init; -static void *__execmem_alloc(struct execmem_range *range, size_t size) +#ifdef CONFIG_MMU +static void *execmem_vmalloc(struct execmem_range *range, size_t size, + pgprot_t pgprot, unsigned long vm_flags) { bool kasan = range->flags & EXECMEM_KASAN_SHADOW; - unsigned long vm_flags = VM_FLUSH_RESET_PERMS; gfp_t gfp_flags = GFP_KERNEL | __GFP_NOWARN; + unsigned int align = range->alignment; unsigned long start = range->start; unsigned long end = range->end; - unsigned int align = range->alignment; - pgprot_t pgprot = range->pgprot; void *p; if (kasan) vm_flags |= VM_DEFER_KMEMLEAK; + if (vm_flags & VM_ALLOW_HUGE_VMAP) + align = PMD_SIZE; + p = __vmalloc_node_range(size, align, start, end, gfp_flags, pgprot, vm_flags, NUMA_NO_NODE, __builtin_return_address(0)); @@ -41,7 +53,7 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size) } if (!p) { - pr_warn_ratelimited("execmem: unable to allocate memory\n"); + pr_warn_ratelimited("unable to allocate memory\n"); return NULL; } @@ -50,14 +62,298 @@ static void *__execmem_alloc(struct execmem_range *range, size_t size) return NULL; } - return kasan_reset_tag(p); + return p; } +#else +static void *execmem_vmalloc(struct execmem_range *range, size_t size, + pgprot_t pgprot, unsigned long vm_flags) +{ + return vmalloc(size); +} +#endif /* CONFIG_MMU */ + +#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX +struct execmem_cache { + struct mutex mutex; + struct maple_tree busy_areas; + struct maple_tree free_areas; +}; + +static struct execmem_cache execmem_cache = { + .mutex = __MUTEX_INITIALIZER(execmem_cache.mutex), + .busy_areas = MTREE_INIT_EXT(busy_areas, MT_FLAGS_LOCK_EXTERN, + execmem_cache.mutex), + .free_areas = MTREE_INIT_EXT(free_areas, MT_FLAGS_LOCK_EXTERN, + execmem_cache.mutex), +}; + +static inline unsigned long mas_range_len(struct ma_state *mas) +{ + return mas->last - mas->index + 1; +} + +static int execmem_set_direct_map_valid(struct vm_struct *vm, bool valid) +{ + unsigned int nr = (1 << get_vm_area_page_order(vm)); + unsigned int updated = 0; + int err = 0; + + for (int i = 0; i < vm->nr_pages; i += nr) { + err = set_direct_map_valid_noflush(vm->pages[i], nr, valid); + if (err) + goto err_restore; 
+ updated += nr; + } + + return 0; + +err_restore: + for (int i = 0; i < updated; i += nr) + set_direct_map_valid_noflush(vm->pages[i], nr, !valid); + + return err; +} + +static void execmem_cache_clean(struct work_struct *work) +{ + struct maple_tree *free_areas = &execmem_cache.free_areas; + struct mutex *mutex = &execmem_cache.mutex; + MA_STATE(mas, free_areas, 0, ULONG_MAX); + void *area; + + mutex_lock(mutex); + mas_for_each(&mas, area, ULONG_MAX) { + size_t size = mas_range_len(&mas); + + if (IS_ALIGNED(size, PMD_SIZE) && + IS_ALIGNED(mas.index, PMD_SIZE)) { + struct vm_struct *vm = find_vm_area(area); + + execmem_set_direct_map_valid(vm, true); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + vfree(area); + } + } + mutex_unlock(mutex); +} + +static DECLARE_WORK(execmem_cache_clean_work, execmem_cache_clean); + +static int execmem_cache_add(void *ptr, size_t size) +{ + struct maple_tree *free_areas = &execmem_cache.free_areas; + struct mutex *mutex = &execmem_cache.mutex; + unsigned long addr = (unsigned long)ptr; + MA_STATE(mas, free_areas, addr - 1, addr + 1); + unsigned long lower, upper; + void *area = NULL; + int err; + + lower = addr; + upper = addr + size - 1; + + mutex_lock(mutex); + area = mas_walk(&mas); + if (area && mas.last == addr - 1) + lower = mas.index; + + area = mas_next(&mas, ULONG_MAX); + if (area && mas.index == addr + size) + upper = mas.last; + + mas_set_range(&mas, lower, upper); + err = mas_store_gfp(&mas, (void *)lower, GFP_KERNEL); + mutex_unlock(mutex); + if (err) + return err; + + return 0; +} + +static bool within_range(struct execmem_range *range, struct ma_state *mas, + size_t size) +{ + unsigned long addr = mas->index; + + if (addr >= range->start && addr + size < range->end) + return true; + + if (range->fallback_start && + addr >= range->fallback_start && addr + size < range->fallback_end) + return true; + + return false; +} + +static void *__execmem_cache_alloc(struct execmem_range *range, size_t size) +{ + struct maple_tree *free_areas = &execmem_cache.free_areas; + struct maple_tree *busy_areas = &execmem_cache.busy_areas; + MA_STATE(mas_free, free_areas, 0, ULONG_MAX); + MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX); + struct mutex *mutex = &execmem_cache.mutex; + unsigned long addr, last, area_size = 0; + void *area, *ptr = NULL; + int err; + + mutex_lock(mutex); + mas_for_each(&mas_free, area, ULONG_MAX) { + area_size = mas_range_len(&mas_free); + + if (area_size >= size && within_range(range, &mas_free, size)) + break; + } + + if (area_size < size) + goto out_unlock; + + addr = mas_free.index; + last = mas_free.last; + + /* insert allocated size to busy_areas at range [addr, addr + size) */ + mas_set_range(&mas_busy, addr, addr + size - 1); + err = mas_store_gfp(&mas_busy, (void *)addr, GFP_KERNEL); + if (err) + goto out_unlock; + + mas_store_gfp(&mas_free, NULL, GFP_KERNEL); + if (area_size > size) { + void *ptr = (void *)(addr + size); + + /* + * re-insert remaining free size to free_areas at range + * [addr + size, last] + */ + mas_set_range(&mas_free, addr + size, last); + err = mas_store_gfp(&mas_free, ptr, GFP_KERNEL); + if (err) { + mas_store_gfp(&mas_busy, NULL, GFP_KERNEL); + goto out_unlock; + } + } + ptr = (void *)addr; + +out_unlock: + mutex_unlock(mutex); + return ptr; +} + +static int execmem_cache_populate(struct execmem_range *range, size_t size) +{ + unsigned long vm_flags = VM_ALLOW_HUGE_VMAP; + unsigned long start, end; + struct vm_struct *vm; + size_t alloc_size; + int err = -ENOMEM; + void *p; + + alloc_size = round_up(size, 
PMD_SIZE); + p = execmem_vmalloc(range, alloc_size, PAGE_KERNEL, vm_flags); + if (!p) + return err; + + vm = find_vm_area(p); + if (!vm) + goto err_free_mem; + + /* fill memory with instructions that will trap */ + execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true); + + start = (unsigned long)p; + end = start + alloc_size; + + vunmap_range(start, end); + + err = execmem_set_direct_map_valid(vm, false); + if (err) + goto err_free_mem; + + err = vmap_pages_range_noflush(start, end, range->pgprot, vm->pages, + PMD_SHIFT); + if (err) + goto err_free_mem; + + err = execmem_cache_add(p, alloc_size); + if (err) + goto err_free_mem; + + return 0; + +err_free_mem: + vfree(p); + return err; +} + +static void *execmem_cache_alloc(struct execmem_range *range, size_t size) +{ + void *p; + int err; + + p = __execmem_cache_alloc(range, size); + if (p) + return p; + + err = execmem_cache_populate(range, size); + if (err) + return NULL; + + return __execmem_cache_alloc(range, size); +} + +static bool execmem_cache_free(void *ptr) +{ + struct maple_tree *busy_areas = &execmem_cache.busy_areas; + struct mutex *mutex = &execmem_cache.mutex; + unsigned long addr = (unsigned long)ptr; + MA_STATE(mas, busy_areas, addr, addr); + size_t size; + void *area; + + mutex_lock(mutex); + area = mas_walk(&mas); + if (!area) { + mutex_unlock(mutex); + return false; + } + size = mas_range_len(&mas); + + mas_store_gfp(&mas, NULL, GFP_KERNEL); + mutex_unlock(mutex); + + execmem_fill_trapping_insns(ptr, size, /* writable = */ false); + + execmem_cache_add(ptr, size); + + schedule_work(&execmem_cache_clean_work); + + return true; +} +#else /* CONFIG_ARCH_HAS_EXECMEM_ROX */ +static void *execmem_cache_alloc(struct execmem_range *range, size_t size) +{ + return NULL; +} + +static bool execmem_cache_free(void *ptr) +{ + return false; +} +#endif /* CONFIG_ARCH_HAS_EXECMEM_ROX */ void *execmem_alloc(enum execmem_type type, size_t size) { struct execmem_range *range = &execmem_info->ranges[type]; + bool use_cache = range->flags & EXECMEM_ROX_CACHE; + unsigned long vm_flags = VM_FLUSH_RESET_PERMS; + pgprot_t pgprot = range->pgprot; + void *p; - return __execmem_alloc(range, size); + if (use_cache) + p = execmem_cache_alloc(range, size); + else + p = execmem_vmalloc(range, size, pgprot, vm_flags); + + return kasan_reset_tag(p); } void execmem_free(void *ptr) @@ -67,7 +363,9 @@ void execmem_free(void *ptr) * supported by vmalloc. 
*/ WARN_ON(in_interrupt()); - vfree(ptr); + + if (!execmem_cache_free(ptr)) + vfree(ptr); } void *execmem_update_copy(void *dst, const void *src, size_t size) @@ -89,6 +387,17 @@ static bool execmem_validate(struct execmem_info *info) return false; } + if (!IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) { + for (int i = EXECMEM_DEFAULT; i < EXECMEM_TYPE_MAX; i++) { + r = &info->ranges[i]; + + if (r->flags & EXECMEM_ROX_CACHE) { + pr_warn_once("ROX cache is not supported\n"); + r->flags &= ~EXECMEM_ROX_CACHE; + } + } + } + return true; } diff --git a/mm/internal.h b/mm/internal.h index c743c2b21dbac..3dc745ba76dd4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1234,6 +1234,7 @@ size_t splice_folio_into_pipe(struct pipe_inode_info *pipe, void __init vmalloc_init(void); int __must_check vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift); +unsigned int get_vm_area_page_order(struct vm_struct *vm); #else static inline void vmalloc_init(void) { diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 5c0ea4e2b17d7..74c0a5eae2102 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -3023,6 +3023,11 @@ static inline unsigned int vm_area_page_order(struct vm_struct *vm) #endif } +unsigned int get_vm_area_page_order(struct vm_struct *vm) +{ + return vm_area_page_order(vm); +} + static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order) { #ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC From 5185e7f9f3bd754ab60680814afd714e2673ef88 Mon Sep 17 00:00:00 2001 From: "Mike Rapoport (Microsoft)" Date: Wed, 23 Oct 2024 19:27:11 +0300 Subject: [PATCH 134/215] x86/module: enable ROX caches for module text on 64 bit Enable execmem's cache of PMD_SIZE'ed pages mapped as ROX for module text allocations on 64 bit. Link: https://lkml.kernel.org/r/20241023162711.2579610-9-rppt@kernel.org Signed-off-by: Mike Rapoport (Microsoft) Reviewed-by: Luis Chamberlain Tested-by: kdevops Cc: Andreas Larsson Cc: Andy Lutomirski Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Brian Cain Cc: Catalin Marinas Cc: Christophe Leroy Cc: Christoph Hellwig Cc: Dave Hansen Cc: Dinh Nguyen Cc: Geert Uytterhoeven Cc: Guo Ren Cc: Helge Deller Cc: Huacai Chen Cc: Ingo Molnar Cc: Johannes Berg Cc: John Paul Adrian Glaubitz Cc: Kent Overstreet Cc: Liam R. 
Howlett Cc: Mark Rutland Cc: Masami Hiramatsu (Google) Cc: Matt Turner Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Oleg Nesterov Cc: Palmer Dabbelt Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Russell King Cc: Song Liu Cc: Stafford Horne Cc: Steven Rostedt (Google) Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Thomas Gleixner Cc: Uladzislau Rezki (Sony) Cc: Vineet Gupta Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + arch/x86/mm/init.c | 37 ++++++++++++++++++++++++++++++++++++- 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 16354dfa6d965..585bf042a6a29 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -83,6 +83,7 @@ config X86 select ARCH_HAS_DMA_OPS if GART_IOMMU || XEN select ARCH_HAS_EARLY_DEBUG if KGDB select ARCH_HAS_ELF_RANDOMIZE + select ARCH_HAS_EXECMEM_ROX if X86_64 select ARCH_HAS_FAST_MULTIPLIER select ARCH_HAS_FORTIFY_SOURCE select ARCH_HAS_GCOV_PROFILE_ALL diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index eb503f53c3195..c2e4f389f47fe 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1053,18 +1053,53 @@ unsigned long arch_max_swapfile_size(void) #ifdef CONFIG_EXECMEM static struct execmem_info execmem_info __ro_after_init; +#ifdef CONFIG_ARCH_HAS_EXECMEM_ROX +void execmem_fill_trapping_insns(void *ptr, size_t size, bool writeable) +{ + /* fill memory with INT3 instructions */ + if (writeable) + memset(ptr, INT3_INSN_OPCODE, size); + else + text_poke_set(ptr, INT3_INSN_OPCODE, size); +} +#endif + struct execmem_info __init *execmem_arch_setup(void) { unsigned long start, offset = 0; + enum execmem_range_flags flags; + pgprot_t pgprot; if (kaslr_enabled()) offset = get_random_u32_inclusive(1, 1024) * PAGE_SIZE; start = MODULES_VADDR + offset; + if (IS_ENABLED(CONFIG_ARCH_HAS_EXECMEM_ROX)) { + pgprot = PAGE_KERNEL_ROX; + flags = EXECMEM_KASAN_SHADOW | EXECMEM_ROX_CACHE; + } else { + pgprot = PAGE_KERNEL; + flags = EXECMEM_KASAN_SHADOW; + } + execmem_info = (struct execmem_info){ .ranges = { - [EXECMEM_DEFAULT] = { + [EXECMEM_MODULE_TEXT] = { + .flags = flags, + .start = start, + .end = MODULES_END, + .pgprot = pgprot, + .alignment = MODULE_ALIGN, + }, + [EXECMEM_KPROBES ... EXECMEM_BPF] = { + .flags = EXECMEM_KASAN_SHADOW, + .start = start, + .end = MODULES_END, + .pgprot = PAGE_KERNEL, + .alignment = MODULE_ALIGN, + }, + [EXECMEM_MODULE_DATA] = { .flags = EXECMEM_KASAN_SHADOW, .start = start, .end = MODULES_END, From 7c8c76e446ca0079692fad44a3993cb1d7666c21 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:54 -0700 Subject: [PATCH 135/215] maple_tree: add mas_for_each_rev() helper Patch series "page allocation tag compression", v4. This patchset implements several improvements: 1. Gracefully handles module unloading while there are used allocations allocated from that module; 2. Provides an option to store page allocation tag references in the page flags, removing dependency on page extensions and eliminating the memory overhead from storing page allocation references (~0.2% of total system memory). This also improves page allocation performance when CONFIG_MEM_ALLOC_PROFILING is enabled by eliminating page extension lookup. Page allocation performance overhead is reduced from 41% to 5.5%. Patch #1 introduces mas_for_each_rev() helper function. Patch #2 introduces shutdown_mem_profiling() helper function to be used when disabling memory allocation profiling. 
Patch #3 copies module tags into virtually contiguous memory which serves two purposes: - Lets us deal with the situation when module is unloaded while there are still live allocations from that module. Since we are using a copy version of the tags we can safely unload the module. Space and gaps in this contiguous memory are managed using a maple tree. - Enables simple indexing of the tags in the later patches. Patch #4 changes the way we allocate virtually contiguous memory for module tags to reserve only virtual area and populate physical pages only as needed at module load time. Patch #5 abstracts page allocation tag reference to simplify later changes. Patch #6 adds a compression option to the sysctl.vm.mem_profiling boot parameter for storing page allocation tag references inside page flags if they fit. If the number of available page flag bits is insufficient to address all kernel allocations, memory allocation profiling gets disabled with an appropriate warning. This patch (of 6): Add mas_for_each_rev() function to iterate maple tree nodes in reverse order. Link: https://lkml.kernel.org/r/20241023170759.999909-1-surenb@google.com Link: https://lkml.kernel.org/r/20241023170759.999909-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Liam R. Howlett Reviewed-by: Liam R. Howlett Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/maple_tree.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/maple_tree.h b/include/linux/maple_tree.h index 61c236850ca86..cbbcd18d41868 100644 --- a/include/linux/maple_tree.h +++ b/include/linux/maple_tree.h @@ -592,6 +592,20 @@ static __always_inline void mas_reset(struct ma_state *mas) #define mas_for_each(__mas, __entry, __max) \ while (((__entry) = mas_find((__mas), (__max))) != NULL) +/** + * mas_for_each_rev() - Iterate over a range of the maple tree in reverse order. + * @__mas: Maple Tree operation state (maple_state) + * @__entry: Entry retrieved from the tree + * @__min: minimum index to retrieve from the tree + * + * When returned, mas->index and mas->last will hold the entire range for the + * entry. + * + * Note: may return the zero entry. + */ +#define mas_for_each_rev(__mas, __entry, __min) \ + while (((__entry) = mas_find_rev((__mas), (__min))) != NULL) + #ifdef CONFIG_DEBUG_MAPLE_TREE enum mt_dump_format { mt_dump_dec, From 3e09c500bb5b0606282d04438404f67df132835a Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:55 -0700 Subject: [PATCH 136/215] alloc_tag: introduce shutdown_mem_profiling helper function Implement a helper function to disable memory allocation profiling and use it when creation of /proc/allocinfo fails. Ensure /proc/allocinfo does not get created when memory allocation profiling is disabled.
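As a hypothetical illustration of how the reverse iterator introduced above is intended to be used (this snippet is not taken from the patches themselves, but mirrors the release_module_tags() caller added later in this series), a walk can start at a maximum index and proceed downwards until a matching entry is found:

#include <linux/maple_tree.h>
#include <linux/module.h>

/* Illustrative only: assumes the tree stores struct module pointers. */
static struct module *find_entry_reverse(struct maple_tree *mt,
					 unsigned long max,
					 struct module *needle)
{
	MA_STATE(mas, mt, max, max);	/* begin the walk at index 'max' */
	struct module *val;

	mas_lock(&mas);
	/* visit entries from 'max' down to index 0 */
	mas_for_each_rev(&mas, val, 0)
		if (val == needle)
			break;
	mas_unlock(&mas);

	/* on a match, mas.index and mas.last hold the entry's full range */
	return val;
}

The loop leaves val pointing at the matching entry, or NULL if the walk reached index 0 without finding one.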
Link: https://lkml.kernel.org/r/20241023170759.999909-3-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- lib/alloc_tag.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 81e5f9a70f220..435aa837e5502 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -8,6 +8,14 @@ #include #include +#define ALLOCINFO_FILE_NAME "allocinfo" + +#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT +static bool mem_profiling_support __meminitdata = true; +#else +static bool mem_profiling_support __meminitdata; +#endif + static struct codetag_type *alloc_tag_cttype; DEFINE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); @@ -144,9 +152,26 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } +static void __init shutdown_mem_profiling(void) +{ + if (mem_alloc_profiling_enabled()) + static_branch_disable(&mem_alloc_profiling_key); + + if (!mem_profiling_support) + return; + + mem_profiling_support = false; +} + static void __init procfs_init(void) { - proc_create_seq("allocinfo", 0400, NULL, &allocinfo_seq_op); + if (!mem_profiling_support) + return; + + if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) { + pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME); + shutdown_mem_profiling(); + } } static bool alloc_tag_module_unload(struct codetag_type *cttype, @@ -174,12 +199,6 @@ static bool alloc_tag_module_unload(struct codetag_type *cttype, return module_unused; } -#ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT -static bool mem_profiling_support __meminitdata = true; -#else -static bool mem_profiling_support __meminitdata; -#endif - static int __init setup_early_mem_profiling(char *str) { bool enable; From 0db6f8d7820a4b788565dac8eed52bfc2c3216da Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:56 -0700 Subject: [PATCH 137/215] alloc_tag: load module tags into separate contiguous memory When a module gets unloaded there is a possibility that some of the allocations it made are still used and therefore the allocation tags corresponding to these allocations are still referenced. As such, the memory for these tags can't be freed. This is currently handled as an abnormal situation and module's data section is not being unloaded. To handle this situation without keeping module's data in memory, allow codetags with longer lifespan than the module to be loaded into their own separate memory. The in-use memory areas and gaps after module unloading in this separate memory are tracked using maple trees. Allocation tags arrange their separate memory so that it is virtually contiguous and that will allow simple allocation tag indexing later on in this patchset. 
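A simplified, hypothetical sketch of the gap tracking described above (the alignment, prepended padding and error handling of the real reserve_module_tags() are omitted): a free gap is located with mas_empty_area() and claimed by storing the owning module over that index range, after which the tags sit at a fixed offset inside the contiguous region:

#include <linux/err.h>
#include <linux/maple_tree.h>
#include <linux/module.h>

/* 'base' stands in for module_tags.start_addr from the patch. */
static void *sketch_reserve_tags(struct maple_tree *mt, unsigned long base,
				 unsigned long region_size,
				 struct module *mod, unsigned long size)
{
	MA_STATE(mas, mt, 0, region_size - 1);
	unsigned long offset;

	mas_lock(&mas);
	/* find a free gap of 'size' bytes anywhere in [0, region_size) */
	if (mas_empty_area(&mas, 0, region_size - 1, size)) {
		mas_unlock(&mas);
		return ERR_PTR(-ENOMEM);
	}
	offset = mas.index;
	mas.last = offset + size - 1;
	/* claim the gap for this module; node allocation errors are ignored here */
	mas_store(&mas, mod);
	mas_unlock(&mas);

	/* tags for this module live at a fixed offset inside the contiguous region */
	return (void *)(base + offset);
}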
The size of this virtually contiguous memory is set to store up to 100000 allocation tags. [surenb@google.com: fix empty codetag module section handling] Link: https://lkml.kernel.org/r/20241101000017.3856204-1-surenb@google.com [akpm@linux-foundation.org: update comment, per Dan] Link: https://lkml.kernel.org/r/20241023170759.999909-4-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/asm-generic/codetag.lds.h | 19 +++ include/linux/alloc_tag.h | 13 +- include/linux/codetag.h | 37 ++++- kernel/module/main.c | 84 ++++++---- lib/alloc_tag.c | 249 +++++++++++++++++++++++++++--- lib/codetag.c | 100 +++++++++++- scripts/module.lds.S | 5 +- 7 files changed, 445 insertions(+), 62 deletions(-) diff --git a/include/asm-generic/codetag.lds.h b/include/asm-generic/codetag.lds.h index 64f536b803802..372c320c50437 100644 --- a/include/asm-generic/codetag.lds.h +++ b/include/asm-generic/codetag.lds.h @@ -11,4 +11,23 @@ #define CODETAG_SECTIONS() \ SECTION_WITH_BOUNDARIES(alloc_tags) +/* + * Module codetags which aren't used after module unload, therefore have the + * same lifespan as the module and can be safely unloaded with the module. + */ +#define MOD_CODETAG_SECTIONS() + +#define MOD_SEPARATE_CODETAG_SECTION(_name) \ + .codetag.##_name : { \ + SECTION_WITH_BOUNDARIES(_name) \ + } + +/* + * For codetags which might be used after module unload, therefore might stay + * longer in memory. Each such codetag type has its own section so that we can + * unload them individually once unused. 
+ */ +#define MOD_SEPARATE_CODETAG_SECTIONS() \ + MOD_SEPARATE_CODETAG_SECTION(alloc_tags) + #endif /* __ASM_GENERIC_CODETAG_LDS_H */ diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 941deffc590df..55d30543c4c7d 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -30,6 +30,13 @@ struct alloc_tag { struct alloc_tag_counters __percpu *counters; } __aligned(8); +struct alloc_tag_module_section { + unsigned long start_addr; + unsigned long end_addr; + /* used size */ + unsigned long size; +}; + #ifdef CONFIG_MEM_ALLOC_PROFILING_DEBUG #define CODETAG_EMPTY ((void *)1) @@ -54,6 +61,8 @@ static inline void set_codetag_empty(union codetag_ref *ref) {} #ifdef CONFIG_MEM_ALLOC_PROFILING +#define ALLOC_TAG_SECTION_NAME "alloc_tags" + struct codetag_bytes { struct codetag *ct; s64 bytes; @@ -76,7 +85,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); #define DEFINE_ALLOC_TAG(_alloc_tag) \ static struct alloc_tag _alloc_tag __used __aligned(8) \ - __section("alloc_tags") = { \ + __section(ALLOC_TAG_SECTION_NAME) = { \ .ct = CODE_TAG_INIT, \ .counters = &_shared_alloc_tag }; @@ -85,7 +94,7 @@ DECLARE_PER_CPU(struct alloc_tag_counters, _shared_alloc_tag); #define DEFINE_ALLOC_TAG(_alloc_tag) \ static DEFINE_PER_CPU(struct alloc_tag_counters, _alloc_tag_cntr); \ static struct alloc_tag _alloc_tag __used __aligned(8) \ - __section("alloc_tags") = { \ + __section(ALLOC_TAG_SECTION_NAME) = { \ .ct = CODE_TAG_INIT, \ .counters = &_alloc_tag_cntr }; diff --git a/include/linux/codetag.h b/include/linux/codetag.h index c2a579ccd4558..d10bd9810d321 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -35,8 +35,15 @@ struct codetag_type_desc { size_t tag_size; void (*module_load)(struct codetag_type *cttype, struct codetag_module *cmod); - bool (*module_unload)(struct codetag_type *cttype, + void (*module_unload)(struct codetag_type *cttype, struct codetag_module *cmod); +#ifdef CONFIG_MODULES + void (*module_replaced)(struct module *mod, struct module *new_mod); + bool (*needs_section_mem)(struct module *mod, unsigned long size); + void *(*alloc_section_mem)(struct module *mod, unsigned long size, + unsigned int prepend, unsigned long align); + void (*free_section_mem)(struct module *mod, bool used); +#endif }; struct codetag_iterator { @@ -71,11 +78,31 @@ struct codetag_type * codetag_register_type(const struct codetag_type_desc *desc); #if defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) + +bool codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size); +void *codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align); +void codetag_free_module_sections(struct module *mod); +void codetag_module_replaced(struct module *mod, struct module *new_mod); void codetag_load_module(struct module *mod); -bool codetag_unload_module(struct module *mod); -#else +void codetag_unload_module(struct module *mod); + +#else /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ + +static inline bool +codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size) { return false; } +static inline void * +codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align) { return NULL; } +static inline void codetag_free_module_sections(struct module *mod) {} +static inline void codetag_module_replaced(struct module *mod, struct module *new_mod) 
{} static inline void codetag_load_module(struct module *mod) {} -static inline bool codetag_unload_module(struct module *mod) { return true; } -#endif +static inline void codetag_unload_module(struct module *mod) {} + +#endif /* defined(CONFIG_CODE_TAGGING) && defined(CONFIG_MODULES) */ #endif /* _LINUX_CODETAG_H */ diff --git a/kernel/module/main.c b/kernel/module/main.c index 73b588fe98d44..00c16f5c55683 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -1251,22 +1251,17 @@ static int module_memory_alloc(struct module *mod, enum mod_mem_type type) return 0; } -static void module_memory_free(struct module *mod, enum mod_mem_type type, - bool unload_codetags) +static void module_memory_free(struct module *mod, enum mod_mem_type type) { struct module_memory *mem = &mod->mem[type]; - void *ptr = mem->base; if (mem->is_rox) vfree(mem->rw_copy); - if (!unload_codetags && mod_mem_type_is_core_data(type)) - return; - - execmem_free(ptr); + execmem_free(mem->base); } -static void free_mod_mem(struct module *mod, bool unload_codetags) +static void free_mod_mem(struct module *mod) { for_each_mod_mem_type(type) { struct module_memory *mod_mem = &mod->mem[type]; @@ -1277,25 +1272,20 @@ static void free_mod_mem(struct module *mod, bool unload_codetags) /* Free lock-classes; relies on the preceding sync_rcu(). */ lockdep_free_key_range(mod_mem->base, mod_mem->size); if (mod_mem->size) - module_memory_free(mod, type, unload_codetags); + module_memory_free(mod, type); } /* MOD_DATA hosts mod, so free it at last */ lockdep_free_key_range(mod->mem[MOD_DATA].base, mod->mem[MOD_DATA].size); - module_memory_free(mod, MOD_DATA, unload_codetags); + module_memory_free(mod, MOD_DATA); } /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { - bool unload_codetags; - trace_module_free(mod); - unload_codetags = codetag_unload_module(mod); - if (!unload_codetags) - pr_warn("%s: memory allocation(s) from the module still alive, cannot unload cleanly\n", - mod->name); + codetag_unload_module(mod); mod_sysfs_teardown(mod); @@ -1338,7 +1328,7 @@ static void free_module(struct module *mod) kfree(mod->args); percpu_modfree(mod); - free_mod_mem(mod, unload_codetags); + free_mod_mem(mod); } void *__symbol_get(const char *symbol) @@ -1603,6 +1593,20 @@ static void __layout_sections(struct module *mod, struct load_info *info, bool i if (WARN_ON_ONCE(type == MOD_INVALID)) continue; + /* + * Do not allocate codetag memory as we load it into + * preallocated contiguous memory. + */ + if (codetag_needs_module_section(mod, sname, s->sh_size)) { + /* + * s->sh_entsize won't be used but populate the + * type field to avoid confusion. 
+ */ + s->sh_entsize = ((unsigned long)(type) & SH_ENTSIZE_TYPE_MASK) + << SH_ENTSIZE_TYPE_SHIFT; + continue; + } + s->sh_entsize = module_get_offset_and_type(mod, type, s, i); pr_debug("\t%s\n", sname); } @@ -2277,6 +2281,7 @@ static int move_module(struct module *mod, struct load_info *info) int i; enum mod_mem_type t = 0; int ret = -ENOMEM; + bool codetag_section_found = false; for_each_mod_mem_type(type) { if (!mod->mem[type].size) { @@ -2288,7 +2293,7 @@ static int move_module(struct module *mod, struct load_info *info) ret = module_memory_alloc(mod, type); if (ret) { t = type; - goto out_enomem; + goto out_err; } } @@ -2297,15 +2302,37 @@ static int move_module(struct module *mod, struct load_info *info) for (i = 0; i < info->hdr->e_shnum; i++) { void *dest; Elf_Shdr *shdr = &info->sechdrs[i]; - enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; - unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; + const char *sname; unsigned long addr; if (!(shdr->sh_flags & SHF_ALLOC)) continue; - addr = (unsigned long)mod->mem[type].base + offset; - dest = mod->mem[type].rw_copy + offset; + sname = info->secstrings + shdr->sh_name; + /* + * Load codetag sections separately as they might still be used + * after module unload. + */ + if (codetag_needs_module_section(mod, sname, shdr->sh_size)) { + dest = codetag_alloc_module_section(mod, sname, shdr->sh_size, + arch_mod_section_prepend(mod, i), shdr->sh_addralign); + if (WARN_ON(!dest)) { + ret = -EINVAL; + goto out_err; + } + if (IS_ERR(dest)) { + ret = PTR_ERR(dest); + goto out_err; + } + addr = (unsigned long)dest; + codetag_section_found = true; + } else { + enum mod_mem_type type = shdr->sh_entsize >> SH_ENTSIZE_TYPE_SHIFT; + unsigned long offset = shdr->sh_entsize & SH_ENTSIZE_OFFSET_MASK; + + addr = (unsigned long)mod->mem[type].base + offset; + dest = mod->mem[type].rw_copy + offset; + } if (shdr->sh_type != SHT_NOBITS) { /* @@ -2317,7 +2344,7 @@ static int move_module(struct module *mod, struct load_info *info) if (i == info->index.mod && (WARN_ON_ONCE(shdr->sh_size != sizeof(struct module)))) { ret = -ENOEXEC; - goto out_enomem; + goto out_err; } memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); } @@ -2333,9 +2360,12 @@ static int move_module(struct module *mod, struct load_info *info) } return 0; -out_enomem: +out_err: for (t--; t >= 0; t--) - module_memory_free(mod, t, true); + module_memory_free(mod, t); + if (codetag_section_found) + codetag_free_module_sections(mod); + return ret; } @@ -2456,6 +2486,8 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) /* Module has been copied to its final place now: return it. 
*/ mod = (void *)info->sechdrs[info->index.mod].sh_addr; kmemleak_load_module(mod, info); + codetag_module_replaced(info->mod, mod); + return mod; } @@ -2465,7 +2497,7 @@ static void module_deallocate(struct module *mod, struct load_info *info) percpu_modfree(mod); module_arch_freeing_init(mod); - free_mod_mem(mod, true); + free_mod_mem(mod); } int __weak module_finalize(const Elf_Ehdr *hdr, diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 435aa837e5502..5f9cd1642d58e 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only #include +#include #include #include #include @@ -9,6 +10,7 @@ #include #define ALLOCINFO_FILE_NAME "allocinfo" +#define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT static bool mem_profiling_support __meminitdata = true; @@ -174,31 +176,226 @@ static void __init procfs_init(void) } } -static bool alloc_tag_module_unload(struct codetag_type *cttype, - struct codetag_module *cmod) +#ifdef CONFIG_MODULES + +static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE); +/* A dummy object used to indicate an unloaded module */ +static struct module unloaded_mod; +/* A dummy object used to indicate a module prepended area */ +static struct module prepend_mod; + +static struct alloc_tag_module_section module_tags; + +static bool needs_section_mem(struct module *mod, unsigned long size) { - struct codetag_iterator iter = codetag_get_ct_iter(cttype); - struct alloc_tag_counters counter; - bool module_unused = true; - struct alloc_tag *tag; - struct codetag *ct; + return size >= sizeof(struct alloc_tag); +} + +static struct alloc_tag *find_used_tag(struct alloc_tag *from, struct alloc_tag *to) +{ + while (from <= to) { + struct alloc_tag_counters counter; - for (ct = codetag_next_ct(&iter); ct; ct = codetag_next_ct(&iter)) { - if (iter.cmod != cmod) + counter = alloc_tag_read(from); + if (counter.bytes) + return from; + from++; + } + + return NULL; +} + +/* Called with mod_area_mt locked */ +static void clean_unused_module_areas_locked(void) +{ + MA_STATE(mas, &mod_area_mt, 0, module_tags.size); + struct module *val; + + mas_for_each(&mas, val, module_tags.size) { + if (val != &unloaded_mod) continue; - tag = ct_to_alloc_tag(ct); - counter = alloc_tag_read(tag); + /* Release area if all tags are unused */ + if (!find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index), + (struct alloc_tag *)(module_tags.start_addr + mas.last))) + mas_erase(&mas); + } +} + +/* Called with mod_area_mt locked */ +static bool find_aligned_area(struct ma_state *mas, unsigned long section_size, + unsigned long size, unsigned int prepend, unsigned long align) +{ + bool cleanup_done = false; + +repeat: + /* Try finding exact size and hope the start is aligned */ + if (!mas_empty_area(mas, 0, section_size - 1, prepend + size)) { + if (IS_ALIGNED(mas->index + prepend, align)) + return true; + + /* Try finding larger area to align later */ + mas_reset(mas); + if (!mas_empty_area(mas, 0, section_size - 1, + size + prepend + align - 1)) + return true; + } + + /* No free area, try cleanup stale data and repeat the search once */ + if (!cleanup_done) { + clean_unused_module_areas_locked(); + cleanup_done = true; + mas_reset(mas); + goto repeat; + } + + return false; +} + +static void *reserve_module_tags(struct module *mod, unsigned long size, + unsigned int prepend, unsigned long align) +{ + unsigned long section_size = module_tags.end_addr - 
module_tags.start_addr; + MA_STATE(mas, &mod_area_mt, 0, section_size - 1); + unsigned long offset; + void *ret = NULL; + + /* If no tags return error */ + if (size < sizeof(struct alloc_tag)) + return ERR_PTR(-EINVAL); + + /* + * align is always power of 2, so we can use IS_ALIGNED and ALIGN. + * align 0 or 1 means no alignment, to simplify set to 1. + */ + if (!align) + align = 1; + + mas_lock(&mas); + if (!find_aligned_area(&mas, section_size, size, prepend, align)) { + ret = ERR_PTR(-ENOMEM); + goto unlock; + } + + /* Mark found area as reserved */ + offset = mas.index; + offset += prepend; + offset = ALIGN(offset, align); + if (offset != mas.index) { + unsigned long pad_start = mas.index; + + mas.last = offset - 1; + mas_store(&mas, &prepend_mod); + if (mas_is_err(&mas)) { + ret = ERR_PTR(xa_err(mas.node)); + goto unlock; + } + mas.index = offset; + mas.last = offset + size - 1; + mas_store(&mas, mod); + if (mas_is_err(&mas)) { + mas.index = pad_start; + mas_erase(&mas); + ret = ERR_PTR(xa_err(mas.node)); + } + } else { + mas.last = offset + size - 1; + mas_store(&mas, mod); + if (mas_is_err(&mas)) + ret = ERR_PTR(xa_err(mas.node)); + } +unlock: + mas_unlock(&mas); + + if (IS_ERR(ret)) + return ret; - if (WARN(counter.bytes, - "%s:%u module %s func:%s has %llu allocated at module unload", - ct->filename, ct->lineno, ct->modname, ct->function, counter.bytes)) - module_unused = false; + if (module_tags.size < offset + size) + module_tags.size = offset + size; + + return (struct alloc_tag *)(module_tags.start_addr + offset); +} + +static void release_module_tags(struct module *mod, bool used) +{ + MA_STATE(mas, &mod_area_mt, module_tags.size, module_tags.size); + struct alloc_tag *tag; + struct module *val; + + mas_lock(&mas); + mas_for_each_rev(&mas, val, 0) + if (val == mod) + break; + + if (!val) /* module not found */ + goto out; + + if (!used) + goto release_area; + + /* Find out if the area is used */ + tag = find_used_tag((struct alloc_tag *)(module_tags.start_addr + mas.index), + (struct alloc_tag *)(module_tags.start_addr + mas.last)); + if (tag) { + struct alloc_tag_counters counter = alloc_tag_read(tag); + + pr_info("%s:%u module %s func:%s has %llu allocated at module unload\n", + tag->ct.filename, tag->ct.lineno, tag->ct.modname, + tag->ct.function, counter.bytes); + } else { + used = false; + } +release_area: + mas_store(&mas, used ? 
&unloaded_mod : NULL); + val = mas_prev_range(&mas, 0); + if (val == &prepend_mod) + mas_store(&mas, NULL); +out: + mas_unlock(&mas); +} + +static void replace_module(struct module *mod, struct module *new_mod) +{ + MA_STATE(mas, &mod_area_mt, 0, module_tags.size); + struct module *val; + + mas_lock(&mas); + mas_for_each(&mas, val, module_tags.size) { + if (val != mod) + continue; + + mas_store_gfp(&mas, new_mod, GFP_KERNEL); + break; } + mas_unlock(&mas); +} + +static int __init alloc_mod_tags_mem(void) +{ + /* Allocate space to copy allocation tags */ + module_tags.start_addr = (unsigned long)execmem_alloc(EXECMEM_MODULE_DATA, + MODULE_ALLOC_TAG_VMAP_SIZE); + if (!module_tags.start_addr) + return -ENOMEM; + + module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE; + + return 0; +} - return module_unused; +static void __init free_mod_tags_mem(void) +{ + execmem_free((void *)module_tags.start_addr); + module_tags.start_addr = 0; } +#else /* CONFIG_MODULES */ + +static inline int alloc_mod_tags_mem(void) { return 0; } +static inline void free_mod_tags_mem(void) {} + +#endif /* CONFIG_MODULES */ + static int __init setup_early_mem_profiling(char *str) { bool enable; @@ -274,14 +471,26 @@ static inline void sysctl_init(void) {} static int __init alloc_tag_init(void) { const struct codetag_type_desc desc = { - .section = "alloc_tags", - .tag_size = sizeof(struct alloc_tag), - .module_unload = alloc_tag_module_unload, + .section = ALLOC_TAG_SECTION_NAME, + .tag_size = sizeof(struct alloc_tag), +#ifdef CONFIG_MODULES + .needs_section_mem = needs_section_mem, + .alloc_section_mem = reserve_module_tags, + .free_section_mem = release_module_tags, + .module_replaced = replace_module, +#endif }; + int res; + + res = alloc_mod_tags_mem(); + if (res) + return res; alloc_tag_cttype = codetag_register_type(&desc); - if (IS_ERR(alloc_tag_cttype)) + if (IS_ERR(alloc_tag_cttype)) { + free_mod_tags_mem(); return PTR_ERR(alloc_tag_cttype); + } sysctl_init(); procfs_init(); diff --git a/lib/codetag.c b/lib/codetag.c index d1fbbb7c2ec3d..7455b966cae43 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -207,6 +207,94 @@ static int codetag_module_init(struct codetag_type *cttype, struct module *mod) } #ifdef CONFIG_MODULES +#define CODETAG_SECTION_PREFIX ".codetag." 
+ +/* Some codetag types need a separate module section */ +bool codetag_needs_module_section(struct module *mod, const char *name, + unsigned long size) +{ + const char *type_name; + struct codetag_type *cttype; + bool ret = false; + + if (strncmp(name, CODETAG_SECTION_PREFIX, strlen(CODETAG_SECTION_PREFIX))) + return false; + + type_name = name + strlen(CODETAG_SECTION_PREFIX); + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (strcmp(type_name, cttype->desc.section) == 0) { + if (!cttype->desc.needs_section_mem) + break; + + down_write(&cttype->mod_lock); + ret = cttype->desc.needs_section_mem(mod, size); + up_write(&cttype->mod_lock); + break; + } + } + mutex_unlock(&codetag_lock); + + return ret; +} + +void *codetag_alloc_module_section(struct module *mod, const char *name, + unsigned long size, unsigned int prepend, + unsigned long align) +{ + const char *type_name = name + strlen(CODETAG_SECTION_PREFIX); + struct codetag_type *cttype; + void *ret = ERR_PTR(-EINVAL); + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (strcmp(type_name, cttype->desc.section) == 0) { + if (WARN_ON(!cttype->desc.alloc_section_mem)) + break; + + down_write(&cttype->mod_lock); + ret = cttype->desc.alloc_section_mem(mod, size, prepend, align); + up_write(&cttype->mod_lock); + break; + } + } + mutex_unlock(&codetag_lock); + + return ret; +} + +void codetag_free_module_sections(struct module *mod) +{ + struct codetag_type *cttype; + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (!cttype->desc.free_section_mem) + continue; + + down_write(&cttype->mod_lock); + cttype->desc.free_section_mem(mod, false); + up_write(&cttype->mod_lock); + } + mutex_unlock(&codetag_lock); +} + +void codetag_module_replaced(struct module *mod, struct module *new_mod) +{ + struct codetag_type *cttype; + + mutex_lock(&codetag_lock); + list_for_each_entry(cttype, &codetag_types, link) { + if (!cttype->desc.module_replaced) + continue; + + down_write(&cttype->mod_lock); + cttype->desc.module_replaced(mod, new_mod); + up_write(&cttype->mod_lock); + } + mutex_unlock(&codetag_lock); +} + void codetag_load_module(struct module *mod) { struct codetag_type *cttype; @@ -220,13 +308,12 @@ void codetag_load_module(struct module *mod) mutex_unlock(&codetag_lock); } -bool codetag_unload_module(struct module *mod) +void codetag_unload_module(struct module *mod) { struct codetag_type *cttype; - bool unload_ok = true; if (!mod) - return true; + return; /* await any module's kfree_rcu() operations to complete */ kvfree_rcu_barrier(); @@ -246,18 +333,17 @@ bool codetag_unload_module(struct module *mod) } if (found) { if (cttype->desc.module_unload) - if (!cttype->desc.module_unload(cttype, cmod)) - unload_ok = false; + cttype->desc.module_unload(cttype, cmod); cttype->count -= range_size(cttype, &cmod->range); idr_remove(&cttype->mod_idr, mod_id); kfree(cmod); } up_write(&cttype->mod_lock); + if (found && cttype->desc.free_section_mem) + cttype->desc.free_section_mem(mod, true); } mutex_unlock(&codetag_lock); - - return unload_ok; } #endif /* CONFIG_MODULES */ diff --git a/scripts/module.lds.S b/scripts/module.lds.S index 3f43edef813cb..711c6e0299365 100644 --- a/scripts/module.lds.S +++ b/scripts/module.lds.S @@ -50,7 +50,7 @@ SECTIONS { .data : { *(.data .data.[0-9a-zA-Z_]*) *(.data..L*) - CODETAG_SECTIONS() + MOD_CODETAG_SECTIONS() } .rodata : { @@ -59,9 +59,10 @@ SECTIONS { } #else .data : { - CODETAG_SECTIONS() + 
MOD_CODETAG_SECTIONS() } #endif + MOD_SEPARATE_CODETAG_SECTIONS() } /* bring in arch-specific sections */ From 0f9b685626daa2f8e19a9788625c9b624c223e45 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:57 -0700 Subject: [PATCH 138/215] alloc_tag: populate memory for module tags as needed The memory reserved for module tags does not need to be backed by physical pages until there are tags to store there. Change the way we reserve this memory to allocate only virtual area for the tags and populate it with physical pages as needed when we load a module. [surenb@google.com: avoid execmem_vmap() when !MMU] Link: https://lkml.kernel.org/r/20241031233611.3833002-1-surenb@google.com Link: https://lkml.kernel.org/r/20241023170759.999909-5-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/execmem.h | 12 +++++++ include/linux/vmalloc.h | 3 ++ lib/Kconfig.debug | 1 + lib/alloc_tag.c | 73 ++++++++++++++++++++++++++++++++++++----- mm/execmem.c | 16 +++++++++ mm/internal.h | 6 ++++ mm/vmalloc.c | 4 +-- 7 files changed, 104 insertions(+), 11 deletions(-) diff --git a/include/linux/execmem.h b/include/linux/execmem.h index 1517fa196bf73..64130ae19690a 100644 --- a/include/linux/execmem.h +++ b/include/linux/execmem.h @@ -139,6 +139,18 @@ void *execmem_alloc(enum execmem_type type, size_t size); */ void execmem_free(void *ptr); +#ifdef CONFIG_MMU +/** + * execmem_vmap - create virtual mapping for EXECMEM_MODULE_DATA memory + * @size: size of the virtual mapping in bytes + * + * Maps virtually contiguous area in the range suitable for EXECMEM_MODULE_DATA. + * + * Return: the area descriptor on success or %NULL on failure. 
+ */ +struct vm_struct *execmem_vmap(size_t size); +#endif + /** * execmem_update_copy - copy an update to executable memory * @dst: destination address to update diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 27408f21e501e..31e9ffd936e39 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -202,6 +202,9 @@ extern int remap_vmalloc_range_partial(struct vm_area_struct *vma, extern int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, unsigned long pgoff); +int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, + struct page **pages, unsigned int page_shift); + /* * Architectures can set this mask to a combination of PGTBL_P?D_MODIFIED values * and let generic vmalloc and ioremap code know when arch_sync_kernel_mappings() diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 7312ae7c3cc57..6798bbbcbd321 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -993,6 +993,7 @@ config CODE_TAGGING config MEM_ALLOC_PROFILING bool "Enable memory allocation profiling" default n + depends on MMU depends on PROC_FS depends on !DEBUG_FORCE_WEAK_PER_CPU select CODE_TAGGING diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 5f9cd1642d58e..4a7fc081b7899 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -8,14 +8,15 @@ #include #include #include +#include #define ALLOCINFO_FILE_NAME "allocinfo" #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT -static bool mem_profiling_support __meminitdata = true; +static bool mem_profiling_support = true; #else -static bool mem_profiling_support __meminitdata; +static bool mem_profiling_support; #endif static struct codetag_type *alloc_tag_cttype; @@ -154,7 +155,7 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } -static void __init shutdown_mem_profiling(void) +static void shutdown_mem_profiling(void) { if (mem_alloc_profiling_enabled()) static_branch_disable(&mem_alloc_profiling_key); @@ -179,6 +180,7 @@ static void __init procfs_init(void) #ifdef CONFIG_MODULES static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE); +static struct vm_struct *vm_module_tags; /* A dummy object used to indicate an unloaded module */ static struct module unloaded_mod; /* A dummy object used to indicate a module prepended area */ @@ -252,6 +254,33 @@ static bool find_aligned_area(struct ma_state *mas, unsigned long section_size, return false; } +static int vm_module_tags_populate(void) +{ + unsigned long phys_size = vm_module_tags->nr_pages << PAGE_SHIFT; + + if (phys_size < module_tags.size) { + struct page **next_page = vm_module_tags->pages + vm_module_tags->nr_pages; + unsigned long addr = module_tags.start_addr + phys_size; + unsigned long more_pages; + unsigned long nr; + + more_pages = ALIGN(module_tags.size - phys_size, PAGE_SIZE) >> PAGE_SHIFT; + nr = alloc_pages_bulk_array_node(GFP_KERNEL | __GFP_NOWARN, + NUMA_NO_NODE, more_pages, next_page); + if (nr < more_pages || + vmap_pages_range(addr, addr + (nr << PAGE_SHIFT), PAGE_KERNEL, + next_page, PAGE_SHIFT) < 0) { + /* Clean up and error out */ + for (int i = 0; i < nr; i++) + __free_page(next_page[i]); + return -ENOMEM; + } + vm_module_tags->nr_pages += nr; + } + + return 0; +} + static void *reserve_module_tags(struct module *mod, unsigned long size, unsigned int prepend, unsigned long align) { @@ -310,8 +339,18 @@ static void *reserve_module_tags(struct module *mod, unsigned long size, if 
(IS_ERR(ret)) return ret; - if (module_tags.size < offset + size) + if (module_tags.size < offset + size) { + int grow_res; + module_tags.size = offset + size; + grow_res = vm_module_tags_populate(); + if (grow_res) { + shutdown_mem_profiling(); + pr_err("Failed to allocate memory for allocation tags in the module %s. Memory allocation profiling is disabled!\n", + mod->name); + return ERR_PTR(grow_res); + } + } return (struct alloc_tag *)(module_tags.start_addr + offset); } @@ -372,12 +411,23 @@ static void replace_module(struct module *mod, struct module *new_mod) static int __init alloc_mod_tags_mem(void) { - /* Allocate space to copy allocation tags */ - module_tags.start_addr = (unsigned long)execmem_alloc(EXECMEM_MODULE_DATA, - MODULE_ALLOC_TAG_VMAP_SIZE); - if (!module_tags.start_addr) + /* Map space to copy allocation tags */ + vm_module_tags = execmem_vmap(MODULE_ALLOC_TAG_VMAP_SIZE); + if (!vm_module_tags) { + pr_err("Failed to map %lu bytes for module allocation tags\n", + MODULE_ALLOC_TAG_VMAP_SIZE); + module_tags.start_addr = 0; return -ENOMEM; + } + vm_module_tags->pages = kmalloc_array(get_vm_area_size(vm_module_tags) >> PAGE_SHIFT, + sizeof(struct page *), GFP_KERNEL | __GFP_ZERO); + if (!vm_module_tags->pages) { + free_vm_area(vm_module_tags); + return -ENOMEM; + } + + module_tags.start_addr = (unsigned long)vm_module_tags->addr; module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE; return 0; @@ -385,8 +435,13 @@ static int __init alloc_mod_tags_mem(void) static void __init free_mod_tags_mem(void) { - execmem_free((void *)module_tags.start_addr); + int i; + module_tags.start_addr = 0; + for (i = 0; i < vm_module_tags->nr_pages; i++) + __free_page(vm_module_tags->pages[i]); + kfree(vm_module_tags->pages); + free_vm_area(vm_module_tags); } #else /* CONFIG_MODULES */ diff --git a/mm/execmem.c b/mm/execmem.c index 576a57e2161f9..317b6a8d35be0 100644 --- a/mm/execmem.c +++ b/mm/execmem.c @@ -64,6 +64,22 @@ static void *execmem_vmalloc(struct execmem_range *range, size_t size, return p; } + +struct vm_struct *execmem_vmap(size_t size) +{ + struct execmem_range *range = &execmem_info->ranges[EXECMEM_MODULE_DATA]; + struct vm_struct *area; + + area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, + range->start, range->end, NUMA_NO_NODE, + GFP_KERNEL, __builtin_return_address(0)); + if (!area && range->fallback_start) + area = __get_vm_area_node(size, range->alignment, PAGE_SHIFT, VM_ALLOC, + range->fallback_start, range->fallback_end, + NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); + + return area; +} #else static void *execmem_vmalloc(struct execmem_range *range, size_t size, pgprot_t pgprot, unsigned long vm_flags) diff --git a/mm/internal.h b/mm/internal.h index 3dc745ba76dd4..cd96848be245c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1263,6 +1263,12 @@ int numa_migrate_check(struct folio *folio, struct vm_fault *vmf, void free_zone_device_folio(struct folio *folio); int migrate_device_coherent_folio(struct folio *folio); +struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long align, unsigned long shift, + unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, + const void *caller); + /* * mm/gup.c */ diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 74c0a5eae2102..7ed39d1042015 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -653,7 +653,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, * RETURNS: * 0 on success, -errno on failure. 
*/ -static int vmap_pages_range(unsigned long addr, unsigned long end, +int vmap_pages_range(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { int err; @@ -3106,7 +3106,7 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm) vm->flags &= ~VM_UNINITIALIZED; } -static struct vm_struct *__get_vm_area_node(unsigned long size, +struct vm_struct *__get_vm_area_node(unsigned long size, unsigned long align, unsigned long shift, unsigned long flags, unsigned long start, unsigned long end, int node, gfp_t gfp_mask, const void *caller) From 42895a86124418d8dd29a93812bc282e569ccfee Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:58 -0700 Subject: [PATCH 139/215] alloc_tag: introduce pgtag_ref_handle to abstract page tag references To simplify later changes to page tag references, introduce new pgtag_ref_handle type. This allows easy replacement of page_ext as a storage of page allocation tags. Link: https://lkml.kernel.org/r/20241023170759.999909-6-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E. McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- include/linux/mm.h | 25 +++++----- include/linux/pgalloc_tag.h | 92 ++++++++++++++++++++++--------------- 2 files changed, 67 insertions(+), 50 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index eb070c14e3099..f9120ac6d901a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4181,37 +4181,38 @@ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new return; for (i = nr_pages; i < (1 << old_order); i += nr_pages) { - union codetag_ref *ref = get_page_tag_ref(folio_page(folio, i)); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { + if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) { /* Set new reference to point to the original tag */ - alloc_tag_ref_set(ref, tag); - put_page_tag_ref(ref); + alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) { + union pgtag_ref_handle handle; + union codetag_ref ref; struct alloc_tag *tag; - union codetag_ref *ref; tag = pgalloc_tag_get(&old->page); if (!tag) return; - ref = get_page_tag_ref(&new->page); - if (!ref) + if (!get_page_tag_ref(&new->page, &ref, &handle)) return; /* Clear the old ref to the original allocation tag. */ clear_page_tag_ref(&old->page); /* Decrement the counters of the tag on get_new_folio. 
*/ - alloc_tag_sub(ref, folio_size(new)); - - __alloc_tag_ref_set(ref, tag); - - put_page_tag_ref(ref); + alloc_tag_sub(&ref, folio_size(new)); + __alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } #else /* !CONFIG_MEM_ALLOC_PROFILING */ static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 59a3deb792a8d..b13cd3313a882 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -11,46 +11,59 @@ #include +union pgtag_ref_handle { + union codetag_ref *ref; /* reference in page extension */ +}; + extern struct page_ext_operations page_alloc_tagging_ops; -static inline union codetag_ref *codetag_ref_from_page_ext(struct page_ext *page_ext) +/* Should be called only if mem_alloc_profiling_enabled() */ +static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, + union pgtag_ref_handle *handle) { - return (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); -} + struct page_ext *page_ext; + union codetag_ref *tmp; -static inline struct page_ext *page_ext_from_codetag_ref(union codetag_ref *ref) -{ - return (void *)ref - page_alloc_tagging_ops.offset; + if (!page) + return false; + + page_ext = page_ext_get(page); + if (!page_ext) + return false; + + tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); + ref->ct = tmp->ct; + handle->ref = tmp; + return true; } -/* Should be called only if mem_alloc_profiling_enabled() */ -static inline union codetag_ref *get_page_tag_ref(struct page *page) +static inline void put_page_tag_ref(union pgtag_ref_handle handle) { - if (page) { - struct page_ext *page_ext = page_ext_get(page); + if (WARN_ON(!handle.ref)) + return; - if (page_ext) - return codetag_ref_from_page_ext(page_ext); - } - return NULL; + page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); } -static inline void put_page_tag_ref(union codetag_ref *ref) +static inline void update_page_tag_ref(union pgtag_ref_handle handle, + union codetag_ref *ref) { - if (WARN_ON(!ref)) + if (WARN_ON(!handle.ref || !ref)) return; - page_ext_put(page_ext_from_codetag_ref(ref)); + handle.ref->ct = ref->ct; } static inline void clear_page_tag_ref(struct page *page) { if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { - set_codetag_empty(ref); - put_page_tag_ref(ref); + if (get_page_tag_ref(page, &ref, &handle)) { + set_codetag_empty(&ref); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } @@ -59,11 +72,13 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) { if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { - alloc_tag_add(ref, task->alloc_tag, PAGE_SIZE * nr); - put_page_tag_ref(ref); + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_add(&ref, task->alloc_tag, PAGE_SIZE * nr); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } @@ -71,11 +86,13 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) { if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); + union pgtag_ref_handle handle; + union codetag_ref ref; - if (ref) { - 
alloc_tag_sub(ref, PAGE_SIZE * nr); - put_page_tag_ref(ref); + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub(&ref, PAGE_SIZE * nr); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); } } } @@ -85,13 +102,14 @@ static inline struct alloc_tag *pgalloc_tag_get(struct page *page) struct alloc_tag *tag = NULL; if (mem_alloc_profiling_enabled()) { - union codetag_ref *ref = get_page_tag_ref(page); - - alloc_tag_sub_check(ref); - if (ref) { - if (ref->ct) - tag = ct_to_alloc_tag(ref->ct); - put_page_tag_ref(ref); + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(page, &ref, &handle)) { + alloc_tag_sub_check(&ref); + if (ref.ct) + tag = ct_to_alloc_tag(ref.ct); + put_page_tag_ref(handle); } } @@ -106,8 +124,6 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) #else /* CONFIG_MEM_ALLOC_PROFILING */ -static inline union codetag_ref *get_page_tag_ref(struct page *page) { return NULL; } -static inline void put_page_tag_ref(union codetag_ref *ref) {} static inline void clear_page_tag_ref(struct page *page) {} static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, unsigned int nr) {} From 4835f747d3ed181bf2c67930fe06b2c01a5d2323 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Wed, 23 Oct 2024 10:07:59 -0700 Subject: [PATCH 140/215] alloc_tag: support for page allocation tag compression Implement support for storing page allocation tag references directly in the page flags instead of page extensions. The sysctl.vm.mem_profiling boot parameter is extended to provide a way for a user to request this mode. Enabling compression eliminates memory overhead caused by page_ext and results in better performance for page allocations. However this mode will not work if the number of available page flag bits is insufficient to address all kernel allocations. Such a condition can happen during boot or when loading a module. If this condition is detected, memory allocation profiling gets disabled with an appropriate warning. By default compression mode is disabled. Link: https://lkml.kernel.org/r/20241023170759.999909-7-surenb@google.com Signed-off-by: Suren Baghdasaryan Reviewed-by: Pasha Tatashin Cc: Ard Biesheuvel Cc: Arnd Bergmann Cc: Borislav Petkov (AMD) Cc: Christoph Hellwig Cc: Daniel Gomez Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: David Rientjes Cc: Dennis Zhou Cc: Johannes Weiner Cc: John Hubbard Cc: Jonathan Corbet Cc: Joonsoo Kim Cc: Kalesh Singh Cc: Kees Cook Cc: Kent Overstreet Cc: Liam R. Howlett Cc: Luis Chamberlain Cc: Matthew Wilcox Cc: Michal Hocko Cc: Mike Rapoport (Microsoft) Cc: Minchan Kim Cc: Paul E.
McKenney Cc: Petr Pavlu Cc: Roman Gushchin Cc: Sami Tolvanen Cc: Sourav Panda Cc: Steven Rostedt (Google) Cc: Thomas Gleixner Cc: Thomas Huth Cc: Uladzislau Rezki (Sony) Cc: Vlastimil Babka Cc: Xiongwei Song Cc: Yu Zhao Signed-off-by: Andrew Morton --- Documentation/mm/allocation-profiling.rst | 7 +- include/linux/alloc_tag.h | 10 +- include/linux/codetag.h | 3 + include/linux/page-flags-layout.h | 7 ++ include/linux/pgalloc_tag.h | 145 +++++++++++++++++++--- lib/alloc_tag.c | 142 +++++++++++++++++++-- lib/codetag.c | 4 +- mm/mm_init.c | 5 +- 8 files changed, 290 insertions(+), 33 deletions(-) diff --git a/Documentation/mm/allocation-profiling.rst b/Documentation/mm/allocation-profiling.rst index ffd6655b7be22..316311240e6aa 100644 --- a/Documentation/mm/allocation-profiling.rst +++ b/Documentation/mm/allocation-profiling.rst @@ -18,12 +18,17 @@ kconfig options: missing annotation Boot parameter: - sysctl.vm.mem_profiling=0|1|never + sysctl.vm.mem_profiling={0|1|never}[,compressed] When set to "never", memory allocation profiling overhead is minimized and it cannot be enabled at runtime (sysctl becomes read-only). When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=y, default value is "1". When CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT=n, default value is "never". + "compressed" optional parameter will try to store page tag references in a + compact format, avoiding page extensions. This results in improved performance + and memory consumption, however it might fail depending on system configuration. + If compression fails, a warning is issued and memory allocation profiling gets + disabled. sysctl: /proc/sys/vm/mem_profiling diff --git a/include/linux/alloc_tag.h b/include/linux/alloc_tag.h index 55d30543c4c7d..7c0786bdf9af0 100644 --- a/include/linux/alloc_tag.h +++ b/include/linux/alloc_tag.h @@ -30,8 +30,16 @@ struct alloc_tag { struct alloc_tag_counters __percpu *counters; } __aligned(8); +struct alloc_tag_kernel_section { + struct alloc_tag *first_tag; + unsigned long count; +}; + struct alloc_tag_module_section { - unsigned long start_addr; + union { + unsigned long start_addr; + struct alloc_tag *first_tag; + }; unsigned long end_addr; /* used size */ unsigned long size; diff --git a/include/linux/codetag.h b/include/linux/codetag.h index d10bd9810d321..d14dbd26b3708 100644 --- a/include/linux/codetag.h +++ b/include/linux/codetag.h @@ -13,6 +13,9 @@ struct codetag_module; struct seq_buf; struct module; +#define CODETAG_SECTION_START_PREFIX "__start_" +#define CODETAG_SECTION_STOP_PREFIX "__stop_" + /* * An instance of this structure is created in a special ELF section at every * code location being tagged. 
At runtime, the special section is treated as diff --git a/include/linux/page-flags-layout.h b/include/linux/page-flags-layout.h index 7d79818dc0651..4f5c9e979bb9a 100644 --- a/include/linux/page-flags-layout.h +++ b/include/linux/page-flags-layout.h @@ -111,5 +111,12 @@ ZONES_WIDTH - LRU_GEN_WIDTH - SECTIONS_WIDTH - \ NODES_WIDTH - KASAN_TAG_WIDTH - LAST_CPUPID_WIDTH) +#define NR_NON_PAGEFLAG_BITS (SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH + \ + LAST_CPUPID_SHIFT + KASAN_TAG_WIDTH + \ + LRU_GEN_WIDTH + LRU_REFS_WIDTH) + +#define NR_UNUSED_PAGEFLAG_BITS (BITS_PER_LONG - \ + (NR_NON_PAGEFLAG_BITS + NR_PAGEFLAGS)) + #endif #endif /* _LINUX_PAGE_FLAGS_LAYOUT */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index b13cd3313a882..1fe63b52e5e55 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -11,29 +11,118 @@ #include +extern struct page_ext_operations page_alloc_tagging_ops; +extern unsigned long alloc_tag_ref_mask; +extern int alloc_tag_ref_offs; +extern struct alloc_tag_kernel_section kernel_tags; + +DECLARE_STATIC_KEY_FALSE(mem_profiling_compressed); + +typedef u16 pgalloc_tag_idx; + union pgtag_ref_handle { union codetag_ref *ref; /* reference in page extension */ + struct page *page; /* reference in page flags */ }; -extern struct page_ext_operations page_alloc_tagging_ops; +/* Reserved indexes */ +#define CODETAG_ID_NULL 0 +#define CODETAG_ID_EMPTY 1 +#define CODETAG_ID_FIRST 2 + +#ifdef CONFIG_MODULES + +extern struct alloc_tag_module_section module_tags; + +static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx) +{ + return &module_tags.first_tag[idx - kernel_tags.count]; +} + +static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag) +{ + return CODETAG_ID_FIRST + kernel_tags.count + (tag - module_tags.first_tag); +} + +#else /* CONFIG_MODULES */ + +static inline struct alloc_tag *module_idx_to_tag(pgalloc_tag_idx idx) +{ + pr_warn("invalid page tag reference %lu\n", (unsigned long)idx); + return NULL; +} + +static inline pgalloc_tag_idx module_tag_to_idx(struct alloc_tag *tag) +{ + pr_warn("invalid page tag 0x%lx\n", (unsigned long)tag); + return CODETAG_ID_NULL; +} + +#endif /* CONFIG_MODULES */ + +static inline void idx_to_ref(pgalloc_tag_idx idx, union codetag_ref *ref) +{ + switch (idx) { + case (CODETAG_ID_NULL): + ref->ct = NULL; + break; + case (CODETAG_ID_EMPTY): + set_codetag_empty(ref); + break; + default: + idx -= CODETAG_ID_FIRST; + ref->ct = idx < kernel_tags.count ? 
+ &kernel_tags.first_tag[idx].ct : + &module_idx_to_tag(idx)->ct; + break; + } +} + +static inline pgalloc_tag_idx ref_to_idx(union codetag_ref *ref) +{ + struct alloc_tag *tag; + + if (!ref->ct) + return CODETAG_ID_NULL; + + if (is_codetag_empty(ref)) + return CODETAG_ID_EMPTY; + + tag = ct_to_alloc_tag(ref->ct); + if (tag >= kernel_tags.first_tag && tag < kernel_tags.first_tag + kernel_tags.count) + return CODETAG_ID_FIRST + (tag - kernel_tags.first_tag); + + return module_tag_to_idx(tag); +} + + /* Should be called only if mem_alloc_profiling_enabled() */ static inline bool get_page_tag_ref(struct page *page, union codetag_ref *ref, union pgtag_ref_handle *handle) { - struct page_ext *page_ext; - union codetag_ref *tmp; - if (!page) return false; - page_ext = page_ext_get(page); - if (!page_ext) - return false; + if (static_key_enabled(&mem_profiling_compressed)) { + pgalloc_tag_idx idx; + + idx = (page->flags >> alloc_tag_ref_offs) & alloc_tag_ref_mask; + idx_to_ref(idx, ref); + handle->page = page; + } else { + struct page_ext *page_ext; + union codetag_ref *tmp; + + page_ext = page_ext_get(page); + if (!page_ext) + return false; + + tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); + ref->ct = tmp->ct; + handle->ref = tmp; + } - tmp = (union codetag_ref *)page_ext_data(page_ext, &page_alloc_tagging_ops); - ref->ct = tmp->ct; - handle->ref = tmp; return true; } @@ -42,16 +131,35 @@ static inline void put_page_tag_ref(union pgtag_ref_handle handle) if (WARN_ON(!handle.ref)) return; - page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); + if (!static_key_enabled(&mem_profiling_compressed)) + page_ext_put((void *)handle.ref - page_alloc_tagging_ops.offset); } -static inline void update_page_tag_ref(union pgtag_ref_handle handle, - union codetag_ref *ref) +static inline void update_page_tag_ref(union pgtag_ref_handle handle, union codetag_ref *ref) { - if (WARN_ON(!handle.ref || !ref)) - return; - - handle.ref->ct = ref->ct; + if (static_key_enabled(&mem_profiling_compressed)) { + struct page *page = handle.page; + unsigned long old_flags; + unsigned long flags; + unsigned long idx; + + if (WARN_ON(!page || !ref)) + return; + + idx = (unsigned long)ref_to_idx(ref); + idx = (idx & alloc_tag_ref_mask) << alloc_tag_ref_offs; + do { + old_flags = READ_ONCE(page->flags); + flags = old_flags; + flags &= ~(alloc_tag_ref_mask << alloc_tag_ref_offs); + flags |= idx; + } while (unlikely(!try_cmpxchg(&page->flags, &old_flags, flags))); + } else { + if (WARN_ON(!handle.ref || !ref)) + return; + + handle.ref->ct = ref->ct; + } } static inline void clear_page_tag_ref(struct page *page) @@ -122,6 +230,8 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } +void __init alloc_tag_sec_init(void); + #else /* CONFIG_MEM_ALLOC_PROFILING */ static inline void clear_page_tag_ref(struct page *page) {} @@ -130,6 +240,7 @@ static inline void pgalloc_tag_add(struct page *page, struct task_struct *task, static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} +static inline void alloc_tag_sec_init(void) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index 4a7fc081b7899..d38a4b2a551da 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -3,6 +3,7 @@ #include #include 
#include +#include #include #include #include @@ -12,6 +13,8 @@ #define ALLOCINFO_FILE_NAME "allocinfo" #define MODULE_ALLOC_TAG_VMAP_SIZE (100000UL * sizeof(struct alloc_tag)) +#define SECTION_START(NAME) (CODETAG_SECTION_START_PREFIX NAME) +#define SECTION_STOP(NAME) (CODETAG_SECTION_STOP_PREFIX NAME) #ifdef CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT static bool mem_profiling_support = true; @@ -26,6 +29,11 @@ EXPORT_SYMBOL(_shared_alloc_tag); DEFINE_STATIC_KEY_MAYBE(CONFIG_MEM_ALLOC_PROFILING_ENABLED_BY_DEFAULT, mem_alloc_profiling_key); +DEFINE_STATIC_KEY_FALSE(mem_profiling_compressed); + +struct alloc_tag_kernel_section kernel_tags = { NULL, 0 }; +unsigned long alloc_tag_ref_mask; +int alloc_tag_ref_offs; struct allocinfo_private { struct codetag_iterator iter; @@ -155,7 +163,7 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } -static void shutdown_mem_profiling(void) +static void shutdown_mem_profiling(bool remove_file) { if (mem_alloc_profiling_enabled()) static_branch_disable(&mem_alloc_profiling_key); @@ -163,6 +171,8 @@ static void shutdown_mem_profiling(void) if (!mem_profiling_support) return; + if (remove_file) + remove_proc_entry(ALLOCINFO_FILE_NAME, NULL); mem_profiling_support = false; } @@ -173,10 +183,40 @@ static void __init procfs_init(void) if (!proc_create_seq(ALLOCINFO_FILE_NAME, 0400, NULL, &allocinfo_seq_op)) { pr_err("Failed to create %s file\n", ALLOCINFO_FILE_NAME); - shutdown_mem_profiling(); + shutdown_mem_profiling(false); } } +void __init alloc_tag_sec_init(void) +{ + struct alloc_tag *last_codetag; + + if (!mem_profiling_support) + return; + + if (!static_key_enabled(&mem_profiling_compressed)) + return; + + kernel_tags.first_tag = (struct alloc_tag *)kallsyms_lookup_name( + SECTION_START(ALLOC_TAG_SECTION_NAME)); + last_codetag = (struct alloc_tag *)kallsyms_lookup_name( + SECTION_STOP(ALLOC_TAG_SECTION_NAME)); + kernel_tags.count = last_codetag - kernel_tags.first_tag; + + /* Check if kernel tags fit into page flags */ + if (kernel_tags.count > (1UL << NR_UNUSED_PAGEFLAG_BITS)) { + shutdown_mem_profiling(false); /* allocinfo file does not exist yet */ + pr_err("%lu allocation tags cannot be references using %d available page flag bits. 
Memory allocation profiling is disabled!\n", + kernel_tags.count, NR_UNUSED_PAGEFLAG_BITS); + return; + } + + alloc_tag_ref_offs = (LRU_REFS_PGOFF - NR_UNUSED_PAGEFLAG_BITS); + alloc_tag_ref_mask = ((1UL << NR_UNUSED_PAGEFLAG_BITS) - 1); + pr_debug("Memory allocation profiling compression is using %d page flag bits!\n", + NR_UNUSED_PAGEFLAG_BITS); +} + #ifdef CONFIG_MODULES static struct maple_tree mod_area_mt = MTREE_INIT(mod_area_mt, MT_FLAGS_ALLOC_RANGE); @@ -186,10 +226,59 @@ static struct module unloaded_mod; /* A dummy object used to indicate a module prepended area */ static struct module prepend_mod; -static struct alloc_tag_module_section module_tags; +struct alloc_tag_module_section module_tags; + +static inline unsigned long alloc_tag_align(unsigned long val) +{ + if (!static_key_enabled(&mem_profiling_compressed)) { + /* No alignment requirements when we are not indexing the tags */ + return val; + } + + if (val % sizeof(struct alloc_tag) == 0) + return val; + return ((val / sizeof(struct alloc_tag)) + 1) * sizeof(struct alloc_tag); +} + +static bool ensure_alignment(unsigned long align, unsigned int *prepend) +{ + if (!static_key_enabled(&mem_profiling_compressed)) { + /* No alignment requirements when we are not indexing the tags */ + return true; + } + + /* + * If alloc_tag size is not a multiple of required alignment, tag + * indexing does not work. + */ + if (!IS_ALIGNED(sizeof(struct alloc_tag), align)) + return false; + + /* Ensure prepend consumes multiple of alloc_tag-sized blocks */ + if (*prepend) + *prepend = alloc_tag_align(*prepend); + + return true; +} + +static inline bool tags_addressable(void) +{ + unsigned long tag_idx_count; + + if (!static_key_enabled(&mem_profiling_compressed)) + return true; /* with page_ext tags are always addressable */ + + tag_idx_count = CODETAG_ID_FIRST + kernel_tags.count + + module_tags.size / sizeof(struct alloc_tag); + + return tag_idx_count < (1UL << NR_UNUSED_PAGEFLAG_BITS); +} static bool needs_section_mem(struct module *mod, unsigned long size) { + if (!mem_profiling_support) + return false; + return size >= sizeof(struct alloc_tag); } @@ -300,6 +389,13 @@ static void *reserve_module_tags(struct module *mod, unsigned long size, if (!align) align = 1; + if (!ensure_alignment(align, &prepend)) { + shutdown_mem_profiling(true); + pr_err("%s: alignment %lu is incompatible with allocation tag indexing. Memory allocation profiling is disabled!\n", + mod->name, align); + return ERR_PTR(-EINVAL); + } + mas_lock(&mas); if (!find_aligned_area(&mas, section_size, size, prepend, align)) { ret = ERR_PTR(-ENOMEM); @@ -343,9 +439,15 @@ static void *reserve_module_tags(struct module *mod, unsigned long size, int grow_res; module_tags.size = offset + size; + if (mem_alloc_profiling_enabled() && !tags_addressable()) { + shutdown_mem_profiling(true); + pr_warn("With module %s there are too many tags to fit in %d page flag bits. Memory allocation profiling is disabled!\n", + mod->name, NR_UNUSED_PAGEFLAG_BITS); + } + grow_res = vm_module_tags_populate(); if (grow_res) { - shutdown_mem_profiling(); + shutdown_mem_profiling(true); pr_err("Failed to allocate memory for allocation tags in the module %s. 
Memory allocation profiling is disabled!\n", mod->name); return ERR_PTR(grow_res); @@ -429,6 +531,8 @@ static int __init alloc_mod_tags_mem(void) module_tags.start_addr = (unsigned long)vm_module_tags->addr; module_tags.end_addr = module_tags.start_addr + MODULE_ALLOC_TAG_VMAP_SIZE; + /* Ensure the base is alloc_tag aligned when required for indexing */ + module_tags.start_addr = alloc_tag_align(module_tags.start_addr); return 0; } @@ -451,8 +555,10 @@ static inline void free_mod_tags_mem(void) {} #endif /* CONFIG_MODULES */ +/* See: Documentation/mm/allocation-profiling.rst */ static int __init setup_early_mem_profiling(char *str) { + bool compressed = false; bool enable; if (!str || !str[0]) @@ -461,22 +567,37 @@ static int __init setup_early_mem_profiling(char *str) if (!strncmp(str, "never", 5)) { enable = false; mem_profiling_support = false; + pr_info("Memory allocation profiling is disabled!\n"); } else { - int res; + char *token = strsep(&str, ","); + + if (kstrtobool(token, &enable)) + return -EINVAL; - res = kstrtobool(str, &enable); - if (res) - return res; + if (str) { + if (strcmp(str, "compressed")) + return -EINVAL; + + compressed = true; + } mem_profiling_support = true; + pr_info("Memory allocation profiling is enabled %s compression and is turned %s!\n", + compressed ? "with" : "without", enable ? "on" : "off"); } - if (enable != static_key_enabled(&mem_alloc_profiling_key)) { + if (enable != mem_alloc_profiling_enabled()) { if (enable) static_branch_enable(&mem_alloc_profiling_key); else static_branch_disable(&mem_alloc_profiling_key); } + if (compressed != static_key_enabled(&mem_profiling_compressed)) { + if (compressed) + static_branch_enable(&mem_profiling_compressed); + else + static_branch_disable(&mem_profiling_compressed); + } return 0; } @@ -484,6 +605,9 @@ early_param("sysctl.vm.mem_profiling", setup_early_mem_profiling); static __init bool need_page_alloc_tagging(void) { + if (static_key_enabled(&mem_profiling_compressed)) + return false; + return mem_profiling_support; } diff --git a/lib/codetag.c b/lib/codetag.c index 7455b966cae43..42aadd6c14549 100644 --- a/lib/codetag.c +++ b/lib/codetag.c @@ -149,8 +149,8 @@ static struct codetag_range get_section_range(struct module *mod, const char *section) { return (struct codetag_range) { - get_symbol(mod, "__start_", section), - get_symbol(mod, "__stop_", section), + get_symbol(mod, CODETAG_SECTION_START_PREFIX, section), + get_symbol(mod, CODETAG_SECTION_STOP_PREFIX, section), }; } diff --git a/mm/mm_init.c b/mm/mm_init.c index 4ba5607aaf194..1c205b0a86ed5 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -83,8 +83,7 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = BITS_PER_LONG; - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH; + width = shift - NR_NON_PAGEFLAG_BITS; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n", SECTIONS_WIDTH, @@ -2639,7 +2638,7 @@ void __init mm_core_init(void) BUILD_BUG_ON(MAX_ZONELISTS > 2); build_all_zonelists(NULL); page_alloc_init_cpuhp(); - + alloc_tag_sec_init(); /* * page_ext requires contiguous pages, * bigger than MAX_PAGE_ORDER unless SPARSEMEM. 
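An aside on the compressed mode wired up above: it amounts to storing a small per-page tag index in otherwise unused page flag bits (alloc_tag_ref_offs/alloc_tag_ref_mask) and updating it with a try_cmpxchg() loop, since other flag bits change concurrently. The stand-alone sketch below shows only that pack/unpack idea; the 16-bit width and the TAG_OFFS/TAG_MASK names are illustrative assumptions, not the kernel's layout, which is computed at boot from NR_UNUSED_PAGEFLAG_BITS. It assumes a 64-bit unsigned long.

  #include <stdio.h>

  /* Illustrative layout: pretend 16 spare bits start at bit 40 of the flags word. */
  #define TAG_OFFS 40
  #define TAG_MASK ((1UL << 16) - 1)

  /* Store a tag index in the spare bits without disturbing the other flags. */
  static unsigned long set_tag_idx(unsigned long flags, unsigned long idx)
  {
          flags &= ~(TAG_MASK << TAG_OFFS);       /* clear the previous index */
          flags |= (idx & TAG_MASK) << TAG_OFFS;  /* store the new one */
          return flags;
  }

  static unsigned long get_tag_idx(unsigned long flags)
  {
          return (flags >> TAG_OFFS) & TAG_MASK;
  }

  int main(void)
  {
          unsigned long flags = 0x3;      /* pretend two unrelated flags are set */

          flags = set_tag_idx(flags, 42);
          printf("flags=%#lx idx=%lu\n", flags, get_tag_idx(flags));
          return 0;
  }

In the kernel, the same mask-and-or sits inside the try_cmpxchg() loop of update_page_tag_ref() above, and indexes 0 and 1 (CODETAG_ID_NULL, CODETAG_ID_EMPTY) are reserved for the NULL and empty references.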
From b7fc16a16b0850e41b88eae0edfa4c085c012347 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 24 Oct 2024 09:23:18 -0700 Subject: [PATCH 141/215] mm/codetag: uninline and move pgalloc_tag_copy and pgalloc_tag_split pgalloc_tag_copy() and pgalloc_tag_split() are sizable and outside of any performance-critical paths, so it should be fine to uninline them. Also move their declarations into pgalloc_tag.h which seems like a more appropriate place for them. No functional changes other than uninlining. Link: https://lkml.kernel.org/r/20241024162318.1640781-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Suggested-by: Andrew Morton Acked-by: Yu Zhao Cc: Kent Overstreet Cc: Pasha Tatashin Cc: Sourav Panda Signed-off-by: Andrew Morton --- include/linux/mm.h | 58 ------------------------------------- include/linux/pgalloc_tag.h | 5 ++++ lib/alloc_tag.c | 48 ++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 58 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index f9120ac6d901a..08e487e5850a3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -4166,62 +4166,4 @@ static inline int do_mseal(unsigned long start, size_t len_in, unsigned long fla } #endif -#ifdef CONFIG_MEM_ALLOC_PROFILING -static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) -{ - int i; - struct alloc_tag *tag; - unsigned int nr_pages = 1 << new_order; - - if (!mem_alloc_profiling_enabled()) - return; - - tag = pgalloc_tag_get(&folio->page); - if (!tag) - return; - - for (i = nr_pages; i < (1 << old_order); i += nr_pages) { - union pgtag_ref_handle handle; - union codetag_ref ref; - - if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) { - /* Set new reference to point to the original tag */ - alloc_tag_ref_set(&ref, tag); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); - } - } -} - -static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) -{ - union pgtag_ref_handle handle; - union codetag_ref ref; - struct alloc_tag *tag; - - tag = pgalloc_tag_get(&old->page); - if (!tag) - return; - - if (!get_page_tag_ref(&new->page, &ref, &handle)) - return; - - /* Clear the old ref to the original allocation tag. */ - clear_page_tag_ref(&old->page); - /* Decrement the counters of the tag on get_new_folio. 
*/ - alloc_tag_sub(&ref, folio_size(new)); - __alloc_tag_ref_set(&ref, tag); - update_page_tag_ref(handle, &ref); - put_page_tag_ref(handle); -} -#else /* !CONFIG_MEM_ALLOC_PROFILING */ -static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) -{ -} - -static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) -{ -} -#endif /* CONFIG_MEM_ALLOC_PROFILING */ - #endif /* _LINUX_MM_H */ diff --git a/include/linux/pgalloc_tag.h b/include/linux/pgalloc_tag.h index 1fe63b52e5e55..0e43ab653ab62 100644 --- a/include/linux/pgalloc_tag.h +++ b/include/linux/pgalloc_tag.h @@ -230,6 +230,9 @@ static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) this_cpu_sub(tag->counters->bytes, PAGE_SIZE * nr); } +void pgalloc_tag_split(struct folio *folio, int old_order, int new_order); +void pgalloc_tag_copy(struct folio *new, struct folio *old); + void __init alloc_tag_sec_init(void); #else /* CONFIG_MEM_ALLOC_PROFILING */ @@ -241,6 +244,8 @@ static inline void pgalloc_tag_sub(struct page *page, unsigned int nr) {} static inline struct alloc_tag *pgalloc_tag_get(struct page *page) { return NULL; } static inline void pgalloc_tag_sub_pages(struct alloc_tag *tag, unsigned int nr) {} static inline void alloc_tag_sec_init(void) {} +static inline void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) {} +static inline void pgalloc_tag_copy(struct folio *new, struct folio *old) {} #endif /* CONFIG_MEM_ALLOC_PROFILING */ diff --git a/lib/alloc_tag.c b/lib/alloc_tag.c index d38a4b2a551da..2414a7ee7ec77 100644 --- a/lib/alloc_tag.c +++ b/lib/alloc_tag.c @@ -163,6 +163,54 @@ size_t alloc_tag_top_users(struct codetag_bytes *tags, size_t count, bool can_sl return nr; } +void pgalloc_tag_split(struct folio *folio, int old_order, int new_order) +{ + int i; + struct alloc_tag *tag; + unsigned int nr_pages = 1 << new_order; + + if (!mem_alloc_profiling_enabled()) + return; + + tag = pgalloc_tag_get(&folio->page); + if (!tag) + return; + + for (i = nr_pages; i < (1 << old_order); i += nr_pages) { + union pgtag_ref_handle handle; + union codetag_ref ref; + + if (get_page_tag_ref(folio_page(folio, i), &ref, &handle)) { + /* Set new reference to point to the original tag */ + alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); + } + } +} + +void pgalloc_tag_copy(struct folio *new, struct folio *old) +{ + union pgtag_ref_handle handle; + union codetag_ref ref; + struct alloc_tag *tag; + + tag = pgalloc_tag_get(&old->page); + if (!tag) + return; + + if (!get_page_tag_ref(&new->page, &ref, &handle)) + return; + + /* Clear the old ref to the original allocation tag. */ + clear_page_tag_ref(&old->page); + /* Decrement the counters of the tag on get_new_folio. */ + alloc_tag_sub(&ref, folio_size(new)); + __alloc_tag_ref_set(&ref, tag); + update_page_tag_ref(handle, &ref); + put_page_tag_ref(handle); +} + static void shutdown_mem_profiling(bool remove_file) { if (mem_alloc_profiling_enabled()) From 91d0ec834786a4c9e1e0c55f0fffc8c82cd66cd7 Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Tue, 1 Oct 2024 23:23:58 +0530 Subject: [PATCH 142/215] zsmalloc: replace kmap_atomic with kmap_local_page The use of kmap_atomic/kunmap_atomic is deprecated. Replace it will kmap_local_page/kunmap_local all over the place. Also fix SPDX missing license header. 
WARNING: Missing or malformed SPDX-License-Identifier tag in line 1 WARNING: Deprecated use of 'kmap_atomic', prefer 'kmap_local_page' instead + vaddr = kmap_atomic(page); Link: https://lkml.kernel.org/r/20241001175358.12970-1-quic_pintu@quicinc.com Signed-off-by: Pintu Kumar Cc: Joe Perches Cc: Minchan Kim Cc: Pintu Agarwal Cc: Sergey Senozhatsky Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 66 ++++++++++++++++++++++++++------------------------- 1 file changed, 34 insertions(+), 32 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 16a07def09c96..d3ff10160a5f8 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1,3 +1,5 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + /* * zsmalloc memory allocator * @@ -898,7 +900,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) set_first_obj_offset(page, off); - vaddr = kmap_atomic(page); + vaddr = kmap_local_page(page); link = (struct link_free *)vaddr + off / sizeof(*link); while ((off += class->size) < PAGE_SIZE) { @@ -921,7 +923,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) */ link->next = -1UL << OBJ_TAG_BITS; } - kunmap_atomic(vaddr); + kunmap_local(vaddr); page = next_page; off %= PAGE_SIZE; } @@ -1059,12 +1061,12 @@ static void *__zs_map_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy object to per-cpu buffer */ - addr = kmap_atomic(pages[0]); + addr = kmap_local_page(pages[0]); memcpy(buf, addr + off, sizes[0]); - kunmap_atomic(addr); - addr = kmap_atomic(pages[1]); + kunmap_local(addr); + addr = kmap_local_page(pages[1]); memcpy(buf + sizes[0], addr, sizes[1]); - kunmap_atomic(addr); + kunmap_local(addr); out: return area->vm_buf; } @@ -1089,12 +1091,12 @@ static void __zs_unmap_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy per-cpu buffer to object */ - addr = kmap_atomic(pages[0]); + addr = kmap_local_page(pages[0]); memcpy(addr + off, buf, sizes[0]); - kunmap_atomic(addr); - addr = kmap_atomic(pages[1]); + kunmap_local(addr); + addr = kmap_local_page(pages[1]); memcpy(addr, buf + sizes[0], sizes[1]); - kunmap_atomic(addr); + kunmap_local(addr); out: /* enable page faults to match kunmap_atomic() return conditions */ @@ -1223,7 +1225,7 @@ void *zs_map_object(struct zs_pool *pool, unsigned long handle, area->vm_mm = mm; if (off + class->size <= PAGE_SIZE) { /* this object is contained entirely within a page */ - area->vm_addr = kmap_atomic(page); + area->vm_addr = kmap_local_page(page); ret = area->vm_addr + off; goto out; } @@ -1260,7 +1262,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) - kunmap_atomic(area->vm_addr); + kunmap_local(area->vm_addr); else { struct page *pages[2]; @@ -1318,7 +1320,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, for (i = 0; i < nr_page; i++) m_page = get_next_page(m_page); - vaddr = kmap_atomic(m_page); + vaddr = kmap_local_page(m_page); link = (struct link_free *)vaddr + m_offset / sizeof(*link); set_freeobj(zspage, link->next >> OBJ_TAG_BITS); if (likely(!ZsHugePage(zspage))) @@ -1328,7 +1330,7 @@ static unsigned long obj_malloc(struct zs_pool *pool, /* record handle to page->index */ zspage->first_page->index = handle | OBJ_ALLOCATED_TAG; - kunmap_atomic(vaddr); + kunmap_local(vaddr); mod_zspage_inuse(zspage, 1); obj = location_to_obj(m_page, obj); @@ -1419,7 +1421,7 @@ static void obj_free(int class_size, unsigned long obj) f_offset = offset_in_page(class_size * 
f_objidx); zspage = get_zspage(f_page); - vaddr = kmap_atomic(f_page); + vaddr = kmap_local_page(f_page); link = (struct link_free *)(vaddr + f_offset); /* Insert this object in containing zspage's freelist */ @@ -1429,7 +1431,7 @@ static void obj_free(int class_size, unsigned long obj) f_page->index = 0; set_freeobj(zspage, f_objidx); - kunmap_atomic(vaddr); + kunmap_local(vaddr); mod_zspage_inuse(zspage, -1); } @@ -1492,8 +1494,8 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, if (d_off + class->size > PAGE_SIZE) d_size = PAGE_SIZE - d_off; - s_addr = kmap_atomic(s_page); - d_addr = kmap_atomic(d_page); + s_addr = kmap_local_page(s_page); + d_addr = kmap_local_page(d_page); while (1) { size = min(s_size, d_size); @@ -1516,26 +1518,26 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, * Documentation/mm/highmem.rst. */ if (s_off >= PAGE_SIZE) { - kunmap_atomic(d_addr); - kunmap_atomic(s_addr); + kunmap_local(d_addr); + kunmap_local(s_addr); s_page = get_next_page(s_page); - s_addr = kmap_atomic(s_page); - d_addr = kmap_atomic(d_page); + s_addr = kmap_local_page(s_page); + d_addr = kmap_local_page(d_page); s_size = class->size - written; s_off = 0; } if (d_off >= PAGE_SIZE) { - kunmap_atomic(d_addr); + kunmap_local(d_addr); d_page = get_next_page(d_page); - d_addr = kmap_atomic(d_page); + d_addr = kmap_local_page(d_page); d_size = class->size - written; d_off = 0; } } - kunmap_atomic(d_addr); - kunmap_atomic(s_addr); + kunmap_local(d_addr); + kunmap_local(s_addr); } /* @@ -1548,7 +1550,7 @@ static unsigned long find_alloced_obj(struct size_class *class, unsigned int offset; int index = *obj_idx; unsigned long handle = 0; - void *addr = kmap_atomic(page); + void *addr = kmap_local_page(page); offset = get_first_obj_offset(page); offset += class->size * index; @@ -1561,7 +1563,7 @@ static unsigned long find_alloced_obj(struct size_class *class, index++; } - kunmap_atomic(addr); + kunmap_local(addr); *obj_idx = index; @@ -1798,14 +1800,14 @@ static int zs_page_migrate(struct page *newpage, struct page *page, migrate_write_lock(zspage); offset = get_first_obj_offset(page); - s_addr = kmap_atomic(page); + s_addr = kmap_local_page(page); /* * Here, any user cannot access all objects in the zspage so let's move. */ - d_addr = kmap_atomic(newpage); + d_addr = kmap_local_page(newpage); copy_page(d_addr, s_addr); - kunmap_atomic(d_addr); + kunmap_local(d_addr); for (addr = s_addr + offset; addr < s_addr + PAGE_SIZE; addr += class->size) { @@ -1818,7 +1820,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, record_obj(handle, new_obj); } } - kunmap_atomic(s_addr); + kunmap_local(s_addr); replace_sub_page(class, zspage, newpage, page); /* From e664c2cd98cbf5e6fcc3ee92b5f3fc8f7f35e11e Mon Sep 17 00:00:00 2001 From: Pintu Kumar Date: Thu, 10 Oct 2024 23:21:43 +0530 Subject: [PATCH 143/215] mm/zsmalloc: use memcpy_from/to_page whereever possible As part of "zsmalloc: replace kmap_atomic with kmap_local_page" [1] we replaced kmap/kunmap_atomic() with kmap_local_page()/kunmap_local(). But later it was found that some of the code could be replaced with already available apis in highmem.h, such as memcpy_from_page()/memcpy_to_page(). Also, update the comments with correct api naming. 
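For context, memcpy_from_page() and memcpy_to_page() are existing helpers in include/linux/highmem.h that bundle the kmap_local_page()/memcpy()/kunmap_local() sequence. A rough sketch of the equivalence this patch relies on, with buf/page/off/len as placeholder names rather than code taken from the diff:

  /* Open-coded read path being removed: */
  addr = kmap_local_page(page);
  memcpy(buf, addr + off, len);
  kunmap_local(addr);
  /* becomes: */
  memcpy_from_page(buf, page, off, len);

  /* Open-coded write path being removed: */
  addr = kmap_local_page(page);
  memcpy(addr + off, buf, len);
  kunmap_local(addr);
  /* becomes: */
  memcpy_to_page(page, off, buf, len);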
[1] https://lkml.kernel.org/r/20241001175358.12970-1-quic_pintu@quicinc.com Link: https://lkml.kernel.org/r/20241010175143.27262-1-quic_pintu@quicinc.com Signed-off-by: Pintu Kumar Suggested-by: Matthew Wilcox Suggested-by: Sergey Senozhatsky Cc: Joe Perches Cc: Minchan Kim Cc: Pintu Agarwal Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index d3ff10160a5f8..64b66a4d3e6ef 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -263,7 +263,7 @@ struct zspage { struct mapping_area { local_lock_t lock; char *vm_buf; /* copy buffer for objects that span pages */ - char *vm_addr; /* address of kmap_atomic()'ed pages */ + char *vm_addr; /* address of kmap_local_page()'ed pages */ enum zs_mapmode vm_mm; /* mapping mode */ }; @@ -1046,11 +1046,10 @@ static inline void __zs_cpu_down(struct mapping_area *area) static void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - int sizes[2]; - void *addr; + size_t sizes[2]; char *buf = area->vm_buf; - /* disable page faults to match kmap_atomic() return conditions */ + /* disable page faults to match kmap_local_page() return conditions */ pagefault_disable(); /* no read fastpath */ @@ -1061,12 +1060,8 @@ static void *__zs_map_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy object to per-cpu buffer */ - addr = kmap_local_page(pages[0]); - memcpy(buf, addr + off, sizes[0]); - kunmap_local(addr); - addr = kmap_local_page(pages[1]); - memcpy(buf + sizes[0], addr, sizes[1]); - kunmap_local(addr); + memcpy_from_page(buf, pages[0], off, sizes[0]); + memcpy_from_page(buf + sizes[0], pages[1], 0, sizes[1]); out: return area->vm_buf; } @@ -1074,8 +1069,7 @@ static void *__zs_map_object(struct mapping_area *area, static void __zs_unmap_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - int sizes[2]; - void *addr; + size_t sizes[2]; char *buf; /* no write fastpath */ @@ -1091,15 +1085,11 @@ static void __zs_unmap_object(struct mapping_area *area, sizes[1] = size - sizes[0]; /* copy per-cpu buffer to object */ - addr = kmap_local_page(pages[0]); - memcpy(addr + off, buf, sizes[0]); - kunmap_local(addr); - addr = kmap_local_page(pages[1]); - memcpy(addr, buf + sizes[0], sizes[1]); - kunmap_local(addr); + memcpy_to_page(pages[0], off, buf, sizes[0]); + memcpy_to_page(pages[1], 0, buf + sizes[0], sizes[1]); out: - /* enable page faults to match kunmap_atomic() return conditions */ + /* enable page faults to match kunmap_local() return conditions */ pagefault_enable(); } @@ -1511,10 +1501,10 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, d_size -= size; /* - * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic() - * calls must occurs in reverse order of calls to kmap_atomic(). - * So, to call kunmap_atomic(s_addr) we should first call - * kunmap_atomic(d_addr). For more details see + * Calling kunmap_local(d_addr) is necessary. kunmap_local() + * calls must occurs in reverse order of calls to kmap_local_page(). + * So, to call kunmap_local(s_addr) we should first call + * kunmap_local(d_addr). For more details see * Documentation/mm/highmem.rst. 
*/ if (s_off >= PAGE_SIZE) { From f7470591f8db1a72ce9f7ab49cb13c2b21b92002 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:12 +0100 Subject: [PATCH 144/215] mm: convert page_to_pgoff() to page_pgoff() Patch series "page->index removals in mm", v2. As part of shrinking struct page, we need to stop using page->index. This patchset gets rid of most of the remaining references to page->index in mm, as well as increasing the number of functions which take a const folio/page pointer. It shrinks the text segment of mm by a few hundred bytes in my test config, probably mostly from removing calls to compound_head() in page_to_pgoff(). This patch (of 7): Change the function signature to pass in the folio as all three callers have it. This removes a reference to page->index, which we're trying to get rid of. And add kernel-doc. Link: https://lkml.kernel.org/r/20241005200121.3231142-1-willy@infradead.org Link: https://lkml.kernel.org/r/20241005200121.3231142-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- include/linux/pagemap.h | 31 +++++++++++++++++-------------- mm/memory-failure.c | 4 ++-- mm/rmap.c | 2 +- 4 files changed, 21 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 08e487e5850a3..78848fbefe94e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1895,7 +1895,7 @@ static inline unsigned long page_to_section(const struct page *page) * * Return: The Page Frame Number of the first page in the folio. */ -static inline unsigned long folio_pfn(struct folio *folio) +static inline unsigned long folio_pfn(const struct folio *folio) { return page_to_pfn(&folio->page); } diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 68a5f1ff3301c..bcf0865a38ae3 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1011,22 +1011,25 @@ static inline struct folio *read_mapping_folio(struct address_space *mapping, return read_cache_folio(mapping, index, NULL, file); } -/* - * Get the offset in PAGE_SIZE (even for hugetlb pages). +/** + * page_pgoff - Calculate the logical page offset of this page. + * @folio: The folio containing this page. + * @page: The page which we need the offset of. + * + * For file pages, this is the offset from the beginning of the file + * in units of PAGE_SIZE. For anonymous pages, this is the offset from + * the beginning of the anon_vma in units of PAGE_SIZE. This will + * return nonsense for KSM pages. + * + * Context: Caller must have a reference on the folio or otherwise + * prevent it from being split or freed. + * + * Return: The offset in units of PAGE_SIZE. 
*/ -static inline pgoff_t page_to_pgoff(struct page *page) +static inline pgoff_t page_pgoff(const struct folio *folio, + const struct page *page) { - struct page *head; - - if (likely(!PageTransTail(page))) - return page->index; - - head = compound_head(page); - /* - * We don't initialize ->index for tail pages: calculate based on - * head page - */ - return head->index + page - head; + return folio->index + folio_page_idx(folio, page); } /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 96ce31e5a203b..58a3d80961a48 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -617,7 +617,7 @@ static void collect_procs_anon(struct folio *folio, struct page *page, if (av == NULL) /* Not actually mapped anymore */ return; - pgoff = page_to_pgoff(page); + pgoff = page_pgoff(folio, page); rcu_read_lock(); for_each_process(tsk) { struct vm_area_struct *vma; @@ -653,7 +653,7 @@ static void collect_procs_file(struct folio *folio, struct page *page, i_mmap_lock_read(mapping); rcu_read_lock(); - pgoff = page_to_pgoff(page); + pgoff = page_pgoff(folio, page); for_each_process(tsk) { struct task_struct *t = task_early_kill(tsk, force_early); unsigned long addr; diff --git a/mm/rmap.c b/mm/rmap.c index 4d75433330f97..77c6b27cb0ee5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1276,7 +1276,7 @@ static void __page_check_anon_rmap(struct folio *folio, struct page *page, */ VM_BUG_ON_FOLIO(folio_anon_vma(folio)->root != vma->anon_vma->root, folio); - VM_BUG_ON_PAGE(page_to_pgoff(page) != linear_page_index(vma, address), + VM_BUG_ON_PAGE(page_pgoff(folio, page) != linear_page_index(vma, address), page); } From 7d3e93eca3ca28bb5927b09b9b603c0c995bcd24 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:13 +0100 Subject: [PATCH 145/215] mm: use page_pgoff() in more places There are several places which currently open-code page_pgoff(), convert them to call it. 
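The conversions in this patch all follow the same one-line shape, since page_pgoff() (added in the previous patch) is defined as exactly the sum being open-coded here:

  /* Before: each caller computes the offset by hand */
  pgoff = folio->index + folio_page_idx(folio, page);

  /* After: use the helper from the previous patch */
  pgoff = page_pgoff(folio, page);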
Link: https://lkml.kernel.org/r/20241005200121.3231142-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- kernel/futex/core.c | 2 +- mm/page_vma_mapped.c | 3 +-- mm/rmap.c | 4 +--- 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/kernel/futex/core.c b/kernel/futex/core.c index 136768ae26375..342dc4dd328b6 100644 --- a/kernel/futex/core.c +++ b/kernel/futex/core.c @@ -399,7 +399,7 @@ int get_futex_key(u32 __user *uaddr, unsigned int flags, union futex_key *key, key->both.offset |= FUT_OFF_INODE; /* inode-based key */ key->shared.i_seq = get_inode_sequence_number(inode); - key->shared.pgoff = folio->index + folio_page_idx(folio, page); + key->shared.pgoff = page_pgoff(folio, page); rcu_read_unlock(); } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index ab1671e71cb2d..6b356853c04e4 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -340,7 +340,6 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { struct folio *folio = page_folio(page); - pgoff_t pgoff = folio->index + folio_page_idx(folio, page); struct page_vma_mapped_walk pvmw = { .pfn = page_to_pfn(page), .nr_pages = 1, @@ -348,7 +347,7 @@ unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) .flags = PVMW_SYNC, }; - pvmw.address = vma_address(vma, pgoff, 1); + pvmw.address = vma_address(vma, page_pgoff(folio, page), 1); if (pvmw.address == -EFAULT) goto out; if (!page_vma_mapped_walk(&pvmw)) diff --git a/mm/rmap.c b/mm/rmap.c index 77c6b27cb0ee5..e5ec8304a1934 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -774,7 +774,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { struct folio *folio = page_folio(page); - pgoff_t pgoff; if (folio_test_anon(folio)) { struct anon_vma *page__anon_vma = folio_anon_vma(folio); @@ -792,8 +791,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) } /* The !page__anon_vma above handles KSM folios */ - pgoff = folio->index + folio_page_idx(folio, page); - return vma_address(vma, pgoff, 1); + return vma_address(vma, page_pgoff(folio, page), 1); } /* From 713da0b33b3e9d16272b57f4c44dee5c052be9b7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:14 +0100 Subject: [PATCH 146/215] mm: renovate page_address_in_vma() This function doesn't modify any of its arguments, so if we make a few other functions take const pointers, we can make page_address_in_vma() take const pointers too. All of its callers have the containing folio already, so pass that in as an argument instead of recalculating it. 
Also add kernel-doc Link: https://lkml.kernel.org/r/20241005200121.3231142-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 7 ++----- mm/internal.h | 4 ++-- mm/ksm.c | 7 +++---- mm/memory-failure.c | 2 +- mm/mempolicy.c | 2 +- mm/rmap.c | 27 ++++++++++++++++++++------- mm/util.c | 2 +- 7 files changed, 30 insertions(+), 21 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index d5e93e44322e5..78923015a2e8d 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -728,11 +728,8 @@ page_vma_mapped_walk_restart(struct page_vma_mapped_walk *pvmw) } bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw); - -/* - * Used by swapoff to help locate where page is expected in vma. - */ -unsigned long page_address_in_vma(struct page *, struct vm_area_struct *); +unsigned long page_address_in_vma(const struct folio *folio, + const struct page *, const struct vm_area_struct *); /* * Cleans the PTEs of shared mappings. diff --git a/mm/internal.h b/mm/internal.h index cd96848be245c..8674f677304a9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -841,7 +841,7 @@ static inline bool free_area_empty(struct free_area *area, int migratetype) } /* mm/util.c */ -struct anon_vma *folio_anon_vma(struct folio *folio); +struct anon_vma *folio_anon_vma(const struct folio *folio); #ifdef CONFIG_MMU void unmap_mapping_folio(struct folio *folio); @@ -959,7 +959,7 @@ extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); * If any page in this range is mapped by this VMA, return the first address * where any of these pages appear. Otherwise, return -EFAULT. */ -static inline unsigned long vma_address(struct vm_area_struct *vma, +static inline unsigned long vma_address(const struct vm_area_struct *vma, pgoff_t pgoff, unsigned long nr_pages) { unsigned long address; diff --git a/mm/ksm.c b/mm/ksm.c index e596bc1b5fa7a..b813225a806d5 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1256,7 +1256,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct folio *folio, if (WARN_ON_ONCE(folio_test_large(folio))) return err; - pvmw.address = page_address_in_vma(&folio->page, vma); + pvmw.address = page_address_in_vma(folio, folio_page(folio, 0), vma); if (pvmw.address == -EFAULT) goto out; @@ -1340,7 +1340,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, { struct folio *kfolio = page_folio(kpage); struct mm_struct *mm = vma->vm_mm; - struct folio *folio; + struct folio *folio = page_folio(page); pmd_t *pmd; pmd_t pmde; pte_t *ptep; @@ -1350,7 +1350,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, int err = -EFAULT; struct mmu_notifier_range range; - addr = page_address_in_vma(page, vma); + addr = page_address_in_vma(folio, page, vma); if (addr == -EFAULT) goto out; @@ -1416,7 +1416,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at(mm, addr, ptep, newpte); - folio = page_folio(page); folio_remove_rmap_pte(folio, page, vma); if (!folio_mapped(folio)) folio_free_swap(folio); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 58a3d80961a48..ea9d883c01c16 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -671,7 +671,7 @@ static void collect_procs_file(struct folio *folio, struct page *page, */ if (vma->vm_mm != t->mm) continue; - addr = page_address_in_vma(page, vma); + addr = page_address_in_vma(folio, page, vma); add_to_kill_anon_file(t, page, vma, to_kill, addr); } } 
diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a29eff5d0585d..bb37cd1a51d87 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1367,7 +1367,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_entry_is_head(folio, &pagelist, lru)) { vma_iter_init(&vmi, mm, start); for_each_vma_range(vmi, vma, end) { - addr = page_address_in_vma( + addr = page_address_in_vma(folio, folio_page(folio, 0), vma); if (addr != -EFAULT) break; diff --git a/mm/rmap.c b/mm/rmap.c index e5ec8304a1934..d4e5fe94fa922 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -767,14 +767,27 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) } #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */ -/* - * At what user virtual address is page expected in vma? - * Caller should check the page is actually part of the vma. +/** + * page_address_in_vma - The virtual address of a page in this VMA. + * @folio: The folio containing the page. + * @page: The page within the folio. + * @vma: The VMA we need to know the address in. + * + * Calculates the user virtual address of this page in the specified VMA. + * It is the caller's responsibililty to check the page is actually + * within the VMA. There may not currently be a PTE pointing at this + * page, but if a page fault occurs at this address, this is the page + * which will be accessed. + * + * Context: Caller should hold a reference to the folio. Caller should + * hold a lock (eg the i_mmap_lock or the mmap_lock) which keeps the + * VMA from being altered. + * + * Return: The virtual address corresponding to this page in the VMA. */ -unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) +unsigned long page_address_in_vma(const struct folio *folio, + const struct page *page, const struct vm_area_struct *vma) { - struct folio *folio = page_folio(page); - if (folio_test_anon(folio)) { struct anon_vma *page__anon_vma = folio_anon_vma(folio); /* @@ -790,7 +803,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return -EFAULT; } - /* The !page__anon_vma above handles KSM folios */ + /* KSM folios don't reach here because of the !page__anon_vma check */ return vma_address(vma, page_pgoff(folio, page), 1); } diff --git a/mm/util.c b/mm/util.c index 4f1275023eb73..60017d2a9e489 100644 --- a/mm/util.c +++ b/mm/util.c @@ -820,7 +820,7 @@ void *vcalloc_noprof(size_t n, size_t size) } EXPORT_SYMBOL(vcalloc_noprof); -struct anon_vma *folio_anon_vma(struct folio *folio) +struct anon_vma *folio_anon_vma(const struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; From 68158bfa3dbd4af8461ef75a91ffc03be942c8fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:15 +0100 Subject: [PATCH 147/215] mm: mass constification of folio/page pointers Now that page_pgoff() takes const pointers, we can constify the pointers to a lot of functions. 
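The mechanical rule behind these annotations is ordinary C const-propagation: a function that only reads through a pointer can take a const-qualified pointer, callers holding non-const pointers still call it without a cast, and the qualifiers therefore have to spread outward from already-constified callees such as page_pgoff() and vma_address(). A minimal sketch with made-up names, not code from the patch:

  struct item {
          unsigned long index;
  };

  /* Read-only accessor: the parameter can be const-qualified. */
  static unsigned long item_index(const struct item *it)
  {
          return it->index;
  }

  static unsigned long item_user(struct item *it)
  {
          /* A non-const pointer converts implicitly to const here. */
          return item_index(it);
  }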
Link: https://lkml.kernel.org/r/20241005200121.3231142-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/ksm.h | 7 ++++--- include/linux/rmap.h | 10 +++++----- mm/internal.h | 5 +++-- mm/ksm.c | 5 +++-- mm/memory-failure.c | 24 +++++++++++++----------- mm/page_vma_mapped.c | 5 +++-- mm/rmap.c | 11 ++++++----- 7 files changed, 37 insertions(+), 30 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 29022e71a074a..6a53ac4885bb4 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -90,7 +90,7 @@ struct folio *ksm_might_need_to_copy(struct folio *folio, void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc); void folio_migrate_ksm(struct folio *newfolio, struct folio *folio); -void collect_procs_ksm(struct folio *folio, struct page *page, +void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early); long ksm_process_profit(struct mm_struct *); @@ -122,8 +122,9 @@ static inline void ksm_might_unmap_zero_page(struct mm_struct *mm, pte_t pte) { } -static inline void collect_procs_ksm(struct folio *folio, struct page *page, - struct list_head *to_kill, int force_early) +static inline void collect_procs_ksm(const struct folio *folio, + const struct page *page, struct list_head *to_kill, + int force_early) { } diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 78923015a2e8d..683a04088f3f2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -171,7 +171,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, unlink_anon_vmas(next); } -struct anon_vma *folio_get_anon_vma(struct folio *folio); +struct anon_vma *folio_get_anon_vma(const struct folio *folio); /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; @@ -194,8 +194,8 @@ enum rmap_level { RMAP_LEVEL_PMD, }; -static inline void __folio_rmap_sanity_checks(struct folio *folio, - struct page *page, int nr_pages, enum rmap_level level) +static inline void __folio_rmap_sanity_checks(const struct folio *folio, + const struct page *page, int nr_pages, enum rmap_level level) { /* hugetlb folios are handled separately. 
*/ VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio); @@ -771,14 +771,14 @@ struct rmap_walk_control { bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma, unsigned long addr, void *arg); int (*done)(struct folio *folio); - struct anon_vma *(*anon_lock)(struct folio *folio, + struct anon_vma *(*anon_lock)(const struct folio *folio, struct rmap_walk_control *rwc); bool (*invalid_vma)(struct vm_area_struct *vma, void *arg); }; void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, +struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc); #else /* !CONFIG_MMU */ diff --git a/mm/internal.h b/mm/internal.h index 8674f677304a9..fd6373cb1c66d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1117,10 +1117,11 @@ void ClearPageHWPoisonTakenOff(struct page *page); bool take_page_off_buddy(struct page *page); bool put_page_back_buddy(struct page *page); struct task_struct *task_early_kill(struct task_struct *tsk, int force_early); -void add_to_kill_ksm(struct task_struct *tsk, struct page *p, +void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long ksm_addr); -unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma); +unsigned long page_mapped_in_vma(const struct page *page, + struct vm_area_struct *vma); #else static inline void unmap_poisoned_folio(struct folio *folio, enum ttu_flags ttu) diff --git a/mm/ksm.c b/mm/ksm.c index b813225a806d5..7ac59cde626c9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1051,7 +1051,8 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } -static inline struct ksm_stable_node *folio_stable_node(struct folio *folio) +static inline +struct ksm_stable_node *folio_stable_node(const struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } @@ -3067,7 +3068,7 @@ void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) /* * Collect processes when the error hit an ksm page. */ -void collect_procs_ksm(struct folio *folio, struct page *page, +void collect_procs_ksm(const struct folio *folio, const struct page *page, struct list_head *to_kill, int force_early) { struct ksm_stable_node *stable_node; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ea9d883c01c16..7ce7ba8586f5a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -445,7 +445,7 @@ static unsigned long dev_pagemap_mapping_shift(struct vm_area_struct *vma, * Schedule a process for later kill. * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. */ -static void __add_to_kill(struct task_struct *tsk, struct page *p, +static void __add_to_kill(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long addr) { @@ -461,7 +461,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, if (is_zone_device_page(p)) tk->size_shift = dev_pagemap_mapping_shift(vma, tk->addr); else - tk->size_shift = page_shift(compound_head(p)); + tk->size_shift = folio_shift(page_folio(p)); /* * Send SIGKILL if "tk->addr == -EFAULT". 
Also, as @@ -486,7 +486,7 @@ static void __add_to_kill(struct task_struct *tsk, struct page *p, list_add_tail(&tk->nd, to_kill); } -static void add_to_kill_anon_file(struct task_struct *tsk, struct page *p, +static void add_to_kill_anon_file(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long addr) { @@ -509,7 +509,7 @@ static bool task_in_to_kill_list(struct list_head *to_kill, return false; } -void add_to_kill_ksm(struct task_struct *tsk, struct page *p, +void add_to_kill_ksm(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, unsigned long addr) { @@ -606,8 +606,9 @@ struct task_struct *task_early_kill(struct task_struct *tsk, int force_early) /* * Collect processes when the error hit an anonymous page. */ -static void collect_procs_anon(struct folio *folio, struct page *page, - struct list_head *to_kill, int force_early) +static void collect_procs_anon(const struct folio *folio, + const struct page *page, struct list_head *to_kill, + int force_early) { struct task_struct *tsk; struct anon_vma *av; @@ -643,8 +644,9 @@ static void collect_procs_anon(struct folio *folio, struct page *page, /* * Collect processes when the error hit a file mapped page. */ -static void collect_procs_file(struct folio *folio, struct page *page, - struct list_head *to_kill, int force_early) +static void collect_procs_file(const struct folio *folio, + const struct page *page, struct list_head *to_kill, + int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -680,7 +682,7 @@ static void collect_procs_file(struct folio *folio, struct page *page, } #ifdef CONFIG_FS_DAX -static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, +static void add_to_kill_fsdax(struct task_struct *tsk, const struct page *p, struct vm_area_struct *vma, struct list_head *to_kill, pgoff_t pgoff) { @@ -691,7 +693,7 @@ static void add_to_kill_fsdax(struct task_struct *tsk, struct page *p, /* * Collect processes when the error hit a fsdax page. */ -static void collect_procs_fsdax(struct page *page, +static void collect_procs_fsdax(const struct page *page, struct address_space *mapping, pgoff_t pgoff, struct list_head *to_kill, bool pre_remove) { @@ -725,7 +727,7 @@ static void collect_procs_fsdax(struct page *page, /* * Collect the processes who have the corrupted page mapped to kill. */ -static void collect_procs(struct folio *folio, struct page *page, +static void collect_procs(const struct folio *folio, const struct page *page, struct list_head *tokill, int force_early) { if (!folio->mapping) diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 6b356853c04e4..81839a9e74f16 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -337,9 +337,10 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) * outside the VMA or not present, returns -EFAULT. * Only valid for normal file or anonymous VMAs. */ -unsigned long page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +unsigned long page_mapped_in_vma(const struct page *page, + struct vm_area_struct *vma) { - struct folio *folio = page_folio(page); + const struct folio *folio = page_folio(page); struct page_vma_mapped_walk pvmw = { .pfn = page_to_pfn(page), .nr_pages = 1, diff --git a/mm/rmap.c b/mm/rmap.c index d4e5fe94fa922..c6c4d4ea29a7e 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -496,7 +496,7 @@ void __init anon_vma_init(void) * concurrently without folio lock protection). 
See folio_lock_anon_vma_read() * which has already covered that, and comment above remap_pages(). */ -struct anon_vma *folio_get_anon_vma(struct folio *folio) +struct anon_vma *folio_get_anon_vma(const struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; @@ -540,7 +540,7 @@ struct anon_vma *folio_get_anon_vma(struct folio *folio) * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ -struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, +struct anon_vma *folio_lock_anon_vma_read(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma = NULL; @@ -1271,8 +1271,9 @@ static void __folio_set_anon(struct folio *folio, struct vm_area_struct *vma, * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped */ -static void __page_check_anon_rmap(struct folio *folio, struct page *page, - struct vm_area_struct *vma, unsigned long address) +static void __page_check_anon_rmap(const struct folio *folio, + const struct page *page, struct vm_area_struct *vma, + unsigned long address) { /* * The page's anon-rmap details (mapping and index) are guaranteed to @@ -2569,7 +2570,7 @@ void __put_anon_vma(struct anon_vma *anon_vma) anon_vma_free(root); } -static struct anon_vma *rmap_walk_anon_lock(struct folio *folio, +static struct anon_vma *rmap_walk_anon_lock(const struct folio *folio, struct rmap_walk_control *rwc) { struct anon_vma *anon_vma; From 0386aaa6e9c826bc494169a914e01a86befe6edf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:16 +0100 Subject: [PATCH 148/215] bootmem: stop using page->index Encode the type into the bottom four bits of page->private and the info into the remaining bits. Also turn the bootmem type into a named enum. 
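The encoding in the diff below is simple enough to restate: get_page_bootmem() now stores (info << 4) | type in page->private, bootmem_type() and bootmem_info() split it back apart, and that is why the setter BUG()s when the type does not fit in four bits or the info does not fit in the remaining ones. A stand-alone restatement of the round trip, using plain unsigned longs and made-up names instead of struct page:

  /* Pack: the low four bits hold the bootmem type, the rest hold the info. */
  static unsigned long pack_bootmem(unsigned long info, unsigned int type)
  {
          return (info << 4) | (type & 0xf);
  }

  static unsigned int unpack_type(unsigned long priv)
  {
          return priv & 0xf;
  }

  static unsigned long unpack_info(unsigned long priv)
  {
          return priv >> 4;
  }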
[arnd@arndb.de: bootmem: add bootmem_type stub function] Link: https://lkml.kernel.org/r/20241015143802.577613-1-arnd@kernel.org [akpm@linux-foundation.org: fix build with !CONFIG_HAVE_BOOTMEM_INFO_NODE] Link: https://lore.kernel.org/oe-kbuild-all/202410090311.eaqcL7IZ-lkp@intel.com/ Link: https://lkml.kernel.org/r/20241005200121.3231142-6-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Arnd Bergmann Cc: kernel test robot Signed-off-by: Andrew Morton --- arch/x86/mm/init_64.c | 28 +++++++++++++++++++--------- include/linux/bootmem_info.h | 35 +++++++++++++++++++++++++++-------- mm/bootmem_info.c | 11 ++++++----- mm/sparse.c | 8 ++++---- 4 files changed, 56 insertions(+), 26 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 5a564130b9d0e..01ea7c6df3036 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -985,22 +985,32 @@ int arch_add_memory(int nid, u64 start, u64 size, return add_pages(nid, start_pfn, nr_pages, params); } -static void __meminit free_pagetable(struct page *page, int order) +static void free_reserved_pages(struct page *page, unsigned long nr_pages) { - unsigned long magic; - unsigned int nr_pages = 1 << order; + while (nr_pages--) + free_reserved_page(page++); +} +static void __meminit free_pagetable(struct page *page, int order) +{ /* bootmem page has reserved flag */ if (PageReserved(page)) { - magic = page->index; - if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) { + unsigned long nr_pages = 1 << order; +#ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE + enum bootmem_type type = bootmem_type(page); + + if (type == SECTION_INFO || type == MIX_SECTION_INFO) { while (nr_pages--) put_page_bootmem(page++); - } else - while (nr_pages--) - free_reserved_page(page++); - } else + } else { + free_reserved_pages(page, nr_pages); + } +#else + free_reserved_pages(page, nr_pages); +#endif + } else { free_pages((unsigned long)page_address(page), order); + } } static void __meminit free_hugepage_table(struct page *page, diff --git a/include/linux/bootmem_info.h b/include/linux/bootmem_info.h index cffa38a73618f..d8a8d245824a2 100644 --- a/include/linux/bootmem_info.h +++ b/include/linux/bootmem_info.h @@ -6,11 +6,10 @@ #include /* - * Types for free bootmem stored in page->lru.next. These have to be in - * some random range in unsigned long space for debugging purposes. + * Types for free bootmem stored in the low bits of page->private. */ -enum { - MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, +enum bootmem_type { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 1, SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, MIX_SECTION_INFO, NODE_INFO, @@ -21,9 +20,19 @@ enum { void __init register_page_bootmem_info_node(struct pglist_data *pgdat); void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type); + enum bootmem_type type); void put_page_bootmem(struct page *page); +static inline enum bootmem_type bootmem_type(const struct page *page) +{ + return (unsigned long)page->private & 0xf; +} + +static inline unsigned long bootmem_info(const struct page *page) +{ + return (unsigned long)page->private >> 4; +} + /* * Any memory allocated via the memblock allocator and not via the * buddy will be marked reserved already in the memmap. 
For those @@ -31,7 +40,7 @@ void put_page_bootmem(struct page *page); */ static inline void free_bootmem_page(struct page *page) { - unsigned long magic = page->index; + enum bootmem_type type = bootmem_type(page); /* * The reserve_bootmem_region sets the reserved flag on bootmem @@ -39,7 +48,7 @@ static inline void free_bootmem_page(struct page *page) */ VM_BUG_ON_PAGE(page_ref_count(page) != 2, page); - if (magic == SECTION_INFO || magic == MIX_SECTION_INFO) + if (type == SECTION_INFO || type == MIX_SECTION_INFO) put_page_bootmem(page); else VM_BUG_ON_PAGE(1, page); @@ -53,8 +62,18 @@ static inline void put_page_bootmem(struct page *page) { } +static inline enum bootmem_type bootmem_type(const struct page *page) +{ + return SECTION_INFO; +} + +static inline unsigned long bootmem_info(const struct page *page) +{ + return 0; +} + static inline void get_page_bootmem(unsigned long info, struct page *page, - unsigned long type) + enum bootmem_type type) { } diff --git a/mm/bootmem_info.c b/mm/bootmem_info.c index fa7cb0c87c03f..95f288169a380 100644 --- a/mm/bootmem_info.c +++ b/mm/bootmem_info.c @@ -14,23 +14,24 @@ #include #include -void get_page_bootmem(unsigned long info, struct page *page, unsigned long type) +void get_page_bootmem(unsigned long info, struct page *page, + enum bootmem_type type) { - page->index = type; + BUG_ON(type > 0xf); + BUG_ON(info > (ULONG_MAX >> 4)); SetPagePrivate(page); - set_page_private(page, info); + set_page_private(page, info << 4 | type); page_ref_inc(page); } void put_page_bootmem(struct page *page) { - unsigned long type = page->index; + enum bootmem_type type = bootmem_type(page); BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (page_ref_dec_return(page) == 1) { - page->index = 0; ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); diff --git a/mm/sparse.c b/mm/sparse.c index 4cb9793f0b526..13b6624d35626 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -720,19 +720,19 @@ static void depopulate_section_memmap(unsigned long pfn, unsigned long nr_pages, static void free_map_bootmem(struct page *memmap) { unsigned long maps_section_nr, removing_section_nr, i; - unsigned long magic, nr_pages; + unsigned long type, nr_pages; struct page *page = virt_to_page(memmap); nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) >> PAGE_SHIFT; for (i = 0; i < nr_pages; i++, page++) { - magic = page->index; + type = bootmem_type(page); - BUG_ON(magic == NODE_INFO); + BUG_ON(type == NODE_INFO); maps_section_nr = pfn_to_section_nr(page_to_pfn(page)); - removing_section_nr = page_private(page); + removing_section_nr = bootmem_info(page); /* * When this function is called, the removing section is From 544ec0ed376486fae387c023390add32e68b58dd Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:17 +0100 Subject: [PATCH 149/215] mm: remove references to page->index in huge_memory.c We already have folios in all these places; it's just a matter of using them instead of the pages. 
Link: https://lkml.kernel.org/r/20241005200121.3231142-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 492c16eaf147a..61fc407330f23 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3199,8 +3199,8 @@ static void __split_huge_page_tail(struct folio *folio, int tail, /* ->mapping in first and second tail page is replaced by other uses */ VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING, page_tail); - page_tail->mapping = head->mapping; - page_tail->index = head->index + tail; + new_folio->mapping = folio->mapping; + new_folio->index = folio->index + tail; /* * page->private should not be set in tail pages. Fix up and warn once @@ -3276,11 +3276,11 @@ static void __split_huge_page(struct page *page, struct list_head *list, ClearPageHasHWPoisoned(head); for (i = nr - new_nr; i >= new_nr; i -= new_nr) { + struct folio *tail; __split_huge_page_tail(folio, i, lruvec, list, new_order); + tail = page_folio(head + i); /* Some pages can be beyond EOF: drop them from page cache */ - if (head[i].index >= end) { - struct folio *tail = page_folio(head + i); - + if (tail->index >= end) { if (shmem_mapping(folio->mapping)) nr_dropped++; else if (folio_test_clear_dirty(tail)) @@ -3288,12 +3288,12 @@ static void __split_huge_page(struct page *page, struct list_head *list, inode_to_wb(folio->mapping->host)); __filemap_remove_folio(tail, NULL); folio_put(tail); - } else if (!PageAnon(page)) { - __xa_store(&folio->mapping->i_pages, head[i].index, - head + i, 0); + } else if (!folio_test_anon(folio)) { + __xa_store(&folio->mapping->i_pages, tail->index, + tail, 0); } else if (swap_cache) { __xa_store(&swap_cache->i_pages, offset + i, - head + i, 0); + tail, 0); } } From 33d7f15f916ea50e9d29b805fcfdbb9e930a742a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Sat, 5 Oct 2024 21:01:18 +0100 Subject: [PATCH 150/215] mm: use page->private instead of page->index in percpu The percpu allocator only uses one field in struct page, just change it from page->index to page->private. 
Link: https://lkml.kernel.org/r/20241005200121.3231142-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/percpu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index d1a73cf65c532..d8dd31a2e407d 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -253,13 +253,13 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) /* set the pointer to a chunk in a page struct */ static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { - page->index = (unsigned long)pcpu; + page->private = (unsigned long)pcpu; } /* obtain pointer to a chunk from a page struct */ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { - return (struct pcpu_chunk *)page->index; + return (struct pcpu_chunk *)page->private; } static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) From 1bc542c6a0d1444559ab75823a89a94d244bf933 Mon Sep 17 00:00:00 2001 From: Zeng Jingxiang Date: Sat, 26 Oct 2024 19:57:14 +0800 Subject: [PATCH 151/215] mm/vmscan: wake up flushers conditionally to avoid cgroup OOM Commit 14aa8b2d5c2e ("mm/mglru: don't sync disk for each aging cycle") removed the opportunity to wake up flushers during the MGLRU page reclamation process can lead to an increased likelihood of triggering OOM when encountering many dirty pages during reclamation on MGLRU. This leads to premature OOM if there are too many dirty pages in cgroup: Killed dd invoked oom-killer: gfp_mask=0x101cca(GFP_HIGHUSER_MOVABLE|__GFP_WRITE), order=0, oom_score_adj=0 Call Trace: dump_stack_lvl+0x5f/0x80 dump_stack+0x14/0x20 dump_header+0x46/0x1b0 oom_kill_process+0x104/0x220 out_of_memory+0x112/0x5a0 mem_cgroup_out_of_memory+0x13b/0x150 try_charge_memcg+0x44f/0x5c0 charge_memcg+0x34/0x50 __mem_cgroup_charge+0x31/0x90 filemap_add_folio+0x4b/0xf0 __filemap_get_folio+0x1a4/0x5b0 ? srso_return_thunk+0x5/0x5f ? __block_commit_write+0x82/0xb0 ext4_da_write_begin+0xe5/0x270 generic_perform_write+0x134/0x2b0 ext4_buffered_write_iter+0x57/0xd0 ext4_file_write_iter+0x76/0x7d0 ? selinux_file_permission+0x119/0x150 ? srso_return_thunk+0x5/0x5f ? srso_return_thunk+0x5/0x5f vfs_write+0x30c/0x440 ksys_write+0x65/0xe0 __x64_sys_write+0x1e/0x30 x64_sys_call+0x11c2/0x1d50 do_syscall_64+0x47/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e memory: usage 308224kB, limit 308224kB, failcnt 2589 swap: usage 0kB, limit 9007199254740988kB, failcnt 0 ... file_dirty 303247360 file_writeback 0 ... oom-kill:constraint=CONSTRAINT_MEMCG,nodemask=(null),cpuset=test, mems_allowed=0,oom_memcg=/test,task_memcg=/test,task=dd,pid=4404,uid=0 Memory cgroup out of memory: Killed process 4404 (dd) total-vm:10512kB, anon-rss:1152kB, file-rss:1824kB, shmem-rss:0kB, UID:0 pgtables:76kB oom_score_adj:0 The flusher wake up was removed to decrease SSD wearing, but if we are seeing all dirty folios at the tail of an LRU, not waking up the flusher could lead to thrashing easily. So wake it up when a memcg is about to OOM due to dirty caches. I did run the build kernel test[1] on V6, with -j16 1G memcg on my local branch: Without the patch(10 times): user 1449.394 system 368.78 372.58 363.03 362.31 360.84 372.70 368.72 364.94 373.51 366.58 (avg 367.399) real 164.883 With the V6 patch(10 times): user 1447.525 system 360.87 360.63 372.39 364.09 368.49 365.15 359.93 362.04 359.72 354.60 (avg 362.79) real 164.514 Test results show that this patch has about 1% performance improvement, which should be caused by noise. 
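Condensed, the heuristic added above has two halves: the isolation path counts how many file folios were taken from the coldest generation and how many of those are dirty but not yet queued for writeback, and the per-lruvec shrink loop wakes the flushers only when every taken file folio falls into that bucket. Roughly (a simplified sketch, field names as in the hunks above):

	/* while sorting folios out of the oldest generation (sort_folio()) */
	if (type == LRU_GEN_FILE && folio_test_dirty(folio)) {
		sc->nr.file_taken += delta;
		if (!folio_test_writeback(folio))
			sc->nr.unqueued_dirty += delta;
	}

	/* once the pass over this lruvec is done (try_to_shrink_lruvec()) */
	if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken)
		wakeup_flusher_threads(WB_REASON_VMSCAN);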
Link: https://lkml.kernel.org/r/20241026115714.1437435-1-jingxiangzeng.cas@gmail.com Link: https://lore.kernel.org/all/CACePvbV4L-gRN9UKKuUnksfVJjOTq_5Sti2-e=pb_w51kucLKQ@mail.gmail.com/ [1] Fixes: 14aa8b2d5c2e ("mm/mglru: don't sync disk for each aging cycle") Suggested-by: Wei Xu Signed-off-by: Zeng Jingxiang Signed-off-by: Kairui Song Reviewed-by: Wei Xu Tested-by: Chris Li Cc: T.J. Mercier Cc: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index caba8e811ec55..76378bc257e38 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4284,6 +4284,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c int tier_idx) { bool success; + bool dirty, writeback; int gen = folio_lru_gen(folio); int type = folio_is_file_lru(folio); int zone = folio_zonenum(folio); @@ -4329,9 +4330,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c return true; } + dirty = folio_test_dirty(folio); + writeback = folio_test_writeback(folio); + if (type == LRU_GEN_FILE && dirty) { + sc->nr.file_taken += delta; + if (!writeback) + sc->nr.unqueued_dirty += delta; + } + /* waiting for writeback */ - if (folio_test_locked(folio) || folio_test_writeback(folio) || - (type == LRU_GEN_FILE && folio_test_dirty(folio))) { + if (folio_test_locked(folio) || writeback || + (type == LRU_GEN_FILE && dirty)) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; @@ -4447,7 +4456,8 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, trace_mm_vmscan_lru_isolate(sc->reclaim_idx, sc->order, MAX_LRU_BATCH, scanned, skipped, isolated, type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); - + if (type == LRU_GEN_FILE) + sc->nr.file_taken += isolated; /* * There might not be eligible folios due to reclaim_idx. Check the * remaining to prevent livelock if it's not making progress. @@ -4581,6 +4591,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap return scanned; retry: reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); + sc->nr.unqueued_dirty += stat.nr_unqueued_dirty; sc->nr_reclaimed += reclaimed; trace_mm_vmscan_lru_shrink_inactive(pgdat->node_id, scanned, reclaimed, &stat, sc->priority, @@ -4789,6 +4800,13 @@ static bool try_to_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) cond_resched(); } + /* + * If too many file cache in the coldest generation can't be evicted + * due to being dirty, wake up the flusher. + */ + if (sc->nr.unqueued_dirty && sc->nr.unqueued_dirty == sc->nr.file_taken) + wakeup_flusher_threads(WB_REASON_VMSCAN); + /* whether this lruvec should be rotated */ return nr_to_scan < 0; } @@ -5934,6 +5952,7 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) bool reclaimable = false; if (lru_gen_enabled() && root_reclaim(sc)) { + memset(&sc->nr, 0, sizeof(sc->nr)); lru_gen_shrink_node(pgdat, sc); return; } From e8c1a296b8066734ef20797ab77e03a90b0c9be8 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sat, 26 Oct 2024 12:35:53 +0200 Subject: [PATCH 152/215] mm/show_mem: use str_yes_no() helper in show_free_areas() Remove hard-coded strings by using the str_yes_no() helper function. 
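For reference, str_yes_no() comes from include/linux/string_choices.h and simply maps a boolean to a fixed string, so callers no longer open-code the ternary. The helper essentially boils down to the following, with a hypothetical caller showing the pattern used in the hunk below:

	static inline const char *str_yes_no(bool v)
	{
		return v ? "yes" : "no";
	}

	/* hypothetical caller, same shape as the show_free_areas() change */
	pr_info("all_unreclaimable? %s\n",
		str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES));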
Link: https://lkml.kernel.org/r/20241026103552.6790-2-thorsten.blum@linux.dev Signed-off-by: Thorsten Blum Signed-off-by: Andrew Morton --- mm/show_mem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/show_mem.c b/mm/show_mem.c index ec885a398fa09..43afb56abbd3e 100644 --- a/mm/show_mem.c +++ b/mm/show_mem.c @@ -285,8 +285,7 @@ static void show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_z #endif K(node_page_state(pgdat, NR_PAGETABLE)), K(node_page_state(pgdat, NR_SECONDARY_PAGETABLE)), - pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? - "yes" : "no"); + str_yes_no(pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)); } for_each_populated_zone(zone) { From 2b1d55498b67ef59bd461236306fa24ae79878e5 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Sat, 26 Oct 2024 09:34:07 +0000 Subject: [PATCH 153/215] memcg: factor out mem_cgroup_stat_aggregate() Currently mem_cgroup_css_rstat_flush() is used to flush the per-CPU statistics from a specified CPU into the global statistics of the memcg. It processes three kinds of data in three for loops using exactly the same method. Therefore, the for loop can be factored out and may make the code more clean. Link: https://lkml.kernel.org/r/20241026093407.310955-1-xiujianfeng@huaweicloud.com Signed-off-by: Xiu Jianfeng Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Cc: Wang Weiyang Signed-off-by: Andrew Morton --- mm/memcontrol.c | 129 ++++++++++++++++++++++++++---------------------- 1 file changed, 70 insertions(+), 59 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 39e902c1dd9fd..bc0033a2aa3ce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3730,68 +3730,90 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) memcg_wb_domain_size_changed(memcg); } -static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) +struct aggregate_control { + /* pointer to the aggregated (CPU and subtree aggregated) counters */ + long *aggregate; + /* pointer to the non-hierarchichal (CPU aggregated) counters */ + long *local; + /* pointer to the pending child counters during tree propagation */ + long *pending; + /* pointer to the parent's pending counters, could be NULL */ + long *ppending; + /* pointer to the percpu counters to be aggregated */ + long *cstat; + /* pointer to the percpu counters of the last aggregation*/ + long *cstat_prev; + /* size of the above counters */ + int size; +}; + +static void mem_cgroup_stat_aggregate(struct aggregate_control *ac) { - struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - struct memcg_vmstats_percpu *statc; + int i; long delta, delta_cpu, v; - int i, nid; - - statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - for (i = 0; i < MEMCG_VMSTAT_SIZE; i++) { + for (i = 0; i < ac->size; i++) { /* * Collect the aggregated propagation counts of groups * below us. We're in a per-cpu loop here and this is * a global counter, so the first cycle will get them. 
*/ - delta = memcg->vmstats->state_pending[i]; + delta = ac->pending[i]; if (delta) - memcg->vmstats->state_pending[i] = 0; + ac->pending[i] = 0; /* Add CPU changes on this level since the last flush */ delta_cpu = 0; - v = READ_ONCE(statc->state[i]); - if (v != statc->state_prev[i]) { - delta_cpu = v - statc->state_prev[i]; + v = READ_ONCE(ac->cstat[i]); + if (v != ac->cstat_prev[i]) { + delta_cpu = v - ac->cstat_prev[i]; delta += delta_cpu; - statc->state_prev[i] = v; + ac->cstat_prev[i] = v; } /* Aggregate counts on this level and propagate upwards */ if (delta_cpu) - memcg->vmstats->state_local[i] += delta_cpu; + ac->local[i] += delta_cpu; if (delta) { - memcg->vmstats->state[i] += delta; - if (parent) - parent->vmstats->state_pending[i] += delta; + ac->aggregate[i] += delta; + if (ac->ppending) + ac->ppending[i] += delta; } } +} - for (i = 0; i < NR_MEMCG_EVENTS; i++) { - delta = memcg->vmstats->events_pending[i]; - if (delta) - memcg->vmstats->events_pending[i] = 0; - - delta_cpu = 0; - v = READ_ONCE(statc->events[i]); - if (v != statc->events_prev[i]) { - delta_cpu = v - statc->events_prev[i]; - delta += delta_cpu; - statc->events_prev[i] = v; - } +static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(css); + struct mem_cgroup *parent = parent_mem_cgroup(memcg); + struct memcg_vmstats_percpu *statc; + struct aggregate_control ac; + int nid; - if (delta_cpu) - memcg->vmstats->events_local[i] += delta_cpu; + statc = per_cpu_ptr(memcg->vmstats_percpu, cpu); - if (delta) { - memcg->vmstats->events[i] += delta; - if (parent) - parent->vmstats->events_pending[i] += delta; - } - } + ac = (struct aggregate_control) { + .aggregate = memcg->vmstats->state, + .local = memcg->vmstats->state_local, + .pending = memcg->vmstats->state_pending, + .ppending = parent ? parent->vmstats->state_pending : NULL, + .cstat = statc->state, + .cstat_prev = statc->state_prev, + .size = MEMCG_VMSTAT_SIZE, + }; + mem_cgroup_stat_aggregate(&ac); + + ac = (struct aggregate_control) { + .aggregate = memcg->vmstats->events, + .local = memcg->vmstats->events_local, + .pending = memcg->vmstats->events_pending, + .ppending = parent ? parent->vmstats->events_pending : NULL, + .cstat = statc->events, + .cstat_prev = statc->events_prev, + .size = NR_MEMCG_EVENTS, + }; + mem_cgroup_stat_aggregate(&ac); for_each_node_state(nid, N_MEMORY) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[nid]; @@ -3804,28 +3826,17 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) lstatc = per_cpu_ptr(pn->lruvec_stats_percpu, cpu); - for (i = 0; i < NR_MEMCG_NODE_STAT_ITEMS; i++) { - delta = lstats->state_pending[i]; - if (delta) - lstats->state_pending[i] = 0; - - delta_cpu = 0; - v = READ_ONCE(lstatc->state[i]); - if (v != lstatc->state_prev[i]) { - delta_cpu = v - lstatc->state_prev[i]; - delta += delta_cpu; - lstatc->state_prev[i] = v; - } - - if (delta_cpu) - lstats->state_local[i] += delta_cpu; + ac = (struct aggregate_control) { + .aggregate = lstats->state, + .local = lstats->state_local, + .pending = lstats->state_pending, + .ppending = plstats ? 
plstats->state_pending : NULL, + .cstat = lstatc->state, + .cstat_prev = lstatc->state_prev, + .size = NR_MEMCG_NODE_STAT_ITEMS, + }; + mem_cgroup_stat_aggregate(&ac); - if (delta) { - lstats->state[i] += delta; - if (plstats) - plstats->state_pending[i] += delta; - } - } } WRITE_ONCE(statc->stats_updates, 0); /* We are in a per-cpu loop here, only do the atomic write once */ From 45488345d4b60f5c3a0a5d78fee76f0f3be896b4 Mon Sep 17 00:00:00 2001 From: Andrew Paniakin Date: Mon, 28 Oct 2024 16:30:53 -0700 Subject: [PATCH 154/215] selftests/damon/huge_count_read_write: provide sufficiently large buffer for DEPRECATED file read Patch series "damon/{self,kunit}tests: minor fixups for DAMON debugfs interface tests". Fixup small broken window panes in DAMON selftests and kunit tests. First four patches clean up DAMON debugfs interface selftests output, by fixing segmentation fault of a test program (patch 1), removing unnecessary debugging messages (patch 2), and hiding error messages from expected failures (patches 3 and 4). Following two patches fix copy-paste mistakes in DAMON Kconfig help message that copied from debugfs kunit test (patch 5) and a comment on the debugfs kunit test code (patch 6). This patch (of 6): 'huge_count_read_write' crashes with segmentation fault when reading DEPRECATED file of DAMON debugfs interface. This is not causing any problem for users or other tests because the purpose of the test is just ensuring the read is not causing kernel warning messages. Nonetheless, it makes the output unnecessarily noisy, and the DEPRECATED file is not properly being tested. It happens because the size of the content of the file is larger than the size of the buffer for the read. The file contains about 170 characters. Increase the buffer size to 256 characters. Link: https://lkml.kernel.org/r/20241028233058.283381-1-sj@kernel.org Link: https://lkml.kernel.org/r/20241028233058.283381-2-sj@kernel.org Fixes: b4a002889d24 ("selftests/damon: test debugfs file reads/writes with huge count") Signed-off-by: Andrew Paniakin Signed-off-by: SeongJae Park Cc: Andrew Panyakin Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/huge_count_read_write.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c index a6fe0689f88dc..f3c199dc8eba2 100644 --- a/tools/testing/selftests/damon/huge_count_read_write.c +++ b/tools/testing/selftests/damon/huge_count_read_write.c @@ -18,7 +18,7 @@ void write_read_with_huge_count(char *file) { int filedesc = open(file, O_RDWR); - char buf[25]; + char buf[256]; int ret; printf("%s %s\n", __func__, file); From e06a6b55ed3db832cb8fbbc2df38b367dbab51ed Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 28 Oct 2024 16:30:54 -0700 Subject: [PATCH 155/215] selftests/damon/huge_count_read_write: remove unnecessary debugging message The program prints expected errors from write/read of the files with invalid huge count, for only debugging purpose. It is only making the output noisy. Remove those. 
Link: https://lkml.kernel.org/r/20241028233058.283381-3-sj@kernel.org Fixes: b4a002889d24 ("selftests/damon: test debugfs file reads/writes with huge count") Signed-off-by: SeongJae Park Cc: Andrew Paniakin Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/huge_count_read_write.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/tools/testing/selftests/damon/huge_count_read_write.c b/tools/testing/selftests/damon/huge_count_read_write.c index f3c199dc8eba2..53e69a669668a 100644 --- a/tools/testing/selftests/damon/huge_count_read_write.c +++ b/tools/testing/selftests/damon/huge_count_read_write.c @@ -28,9 +28,7 @@ void write_read_with_huge_count(char *file) } write(filedesc, "", 0xfffffffful); - perror("after write: "); ret = read(filedesc, buf, 0xfffffffful); - perror("after read: "); close(filedesc); } From 82475d111de73b5688389d2736509bf30cb338d8 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 28 Oct 2024 16:30:55 -0700 Subject: [PATCH 156/215] selftests/damon/_debugfs_common: hide expected error message from test_write_result() DAMON debugfs interface selftests use test_write_result() to check if valid or invalid writes to files of the interface success or fail as expected. File write error messages from expected failures are only making the output noisy. Hide such expected error messages. Link: https://lkml.kernel.org/r/20241028233058.283381-4-sj@kernel.org Fixes: b348eb7abd09 ("mm/damon: add user space selftests") Signed-off-by: SeongJae Park Cc: Andrew Paniakin Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/_debugfs_common.sh | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/_debugfs_common.sh b/tools/testing/selftests/damon/_debugfs_common.sh index aa995516870bc..54d45791b0d9c 100644 --- a/tools/testing/selftests/damon/_debugfs_common.sh +++ b/tools/testing/selftests/damon/_debugfs_common.sh @@ -8,7 +8,12 @@ test_write_result() { expect_reason=$4 expected=$5 - echo "$content" > "$file" + if [ "$expected" = "0" ] + then + echo "$content" > "$file" + else + echo "$content" > "$file" 2> /dev/null + fi if [ $? -ne "$expected" ] then echo "writing $content to $file doesn't return $expected" From 9b1266ee08c2e45684d58a53a48866a230c76bff Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 28 Oct 2024 16:30:56 -0700 Subject: [PATCH 157/215] selftests/damon/debugfs_duplicate_context_creation: hide errors from expected file write failures debugfs_duplicate_context_creation.sh does an invalid file write to ensure it fails. Check of the failure is sufficient, so the error message from the failure only makes the output unnecessarily noisy. Hide it. 
Link: https://lkml.kernel.org/r/20241028233058.283381-5-sj@kernel.org Fixes: ade38b8ca5ce ("selftest/damon: add a test for duplicate context dirs creation") Signed-off-by: SeongJae Park Cc: Andrew Paniakin Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- .../selftests/damon/debugfs_duplicate_context_creation.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh index 4a76e37ef16b1..bd6c22d96ead3 100755 --- a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh +++ b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh @@ -12,7 +12,7 @@ then exit 1 fi -if echo foo > "$DBGFS/mk_contexts" +if echo foo > "$DBGFS/mk_contexts" 2> /dev/null then echo "duplicate context creation success" exit 1 From 12d021659c7aa5059835e671e57c4a163a64d2e9 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 28 Oct 2024 16:30:57 -0700 Subject: [PATCH 158/215] mm/damon/Kconfig: update DBGFS_KUNIT prompt copy for SYSFS_KUNIT CONFIG_DAMON_SYSFS_KUNIT_TEST prompt is copied from that for DAMON debugfs interface kunit tests, and not correctly updated. Fix it. Link: https://lkml.kernel.org/r/20241028233058.283381-6-sj@kernel.org Fixes: b8ee5575f763 ("mm/damon/sysfs-test: add a unit test for damon_sysfs_set_targets()") Signed-off-by: SeongJae Park Cc: Andrew Paniakin Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 35b72f88983a2..d0357f3e93721 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -60,7 +60,7 @@ config DAMON_SYSFS the interface for arbitrary data access monitoring. config DAMON_SYSFS_KUNIT_TEST - bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS + bool "Test for damon sysfs interface" if !KUNIT_ALL_TESTS depends on DAMON_SYSFS && KUNIT=y default KUNIT_ALL_TESTS help From 73da523802eafafe7e55a5e8a8bc8ee3f5cf3b9b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Mon, 28 Oct 2024 16:30:58 -0700 Subject: [PATCH 159/215] mm/damon/tests/dbgfs-kunit: fix the header double inclusion guarding ifdef comment Closing part of double inclusion guarding macro for dbgfs-kunit.h was copy-pasted from somewhere (maybe before the initial mainline merge of DAMON), and not properly updated. Fix it. Link: https://lkml.kernel.org/r/20241028233058.283381-7-sj@kernel.org Fixes: 17ccae8bb5c9 ("mm/damon: add kunit tests") Signed-off-by: SeongJae Park Cc: Andrew Paniakin Cc: Brendan Higgins Cc: David Gow Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/tests/dbgfs-kunit.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/tests/dbgfs-kunit.h b/mm/damon/tests/dbgfs-kunit.h index d2ecfcc8db861..087e53f641a81 100644 --- a/mm/damon/tests/dbgfs-kunit.h +++ b/mm/damon/tests/dbgfs-kunit.h @@ -168,6 +168,6 @@ static struct kunit_suite damon_test_suite = { }; kunit_test_suite(damon_test_suite); -#endif /* _DAMON_TEST_H */ +#endif /* _DAMON_DBGFS_TEST_H */ #endif /* CONFIG_DAMON_KUNIT_TEST */ From 69bad21551c9caea8c58800f96da48a704fd311e Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:16 -0700 Subject: [PATCH 160/215] mm: define obj_cgroup_get() if CONFIG_MEMCG is not defined Patch series "mm: zswap swap-out of large folios", v10. 
This patch series enables zswap_store() to accept and store large folios. The most significant contribution in this series is from the earlier RFC submitted by Ryan Roberts [1]. Ryan's original RFC has been migrated to mm-unstable as of 9-30-2024 in patch 6 of this series, and adapted based on code review comments received for the current patch-series. [1]: [RFC PATCH v1] mm: zswap: Store large folios without splitting https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u The first few patches do the prep work for supporting large folios in zswap_store. Patch 6 provides the main functionality to swap-out large folios in zswap. Patch 7 adds sysfs per-order hugepages "zswpout" counters that get incremented upon successful zswap_store of large folios, and also updates the documentation for this: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout This patch series is a prerequisite for zswap compress batching of large folio swap-out and decompress batching of swap-ins based on swapin_readahead(), using Intel IAA hardware acceleration, which we would like to submit in subsequent patch-series, with performance improvement data. Thanks to Ying Huang for pre-posting review feedback and suggestions! Thanks also to Nhat, Yosry, Johannes, Barry, Chengming, Usama, Ying and Matthew for their helpful feedback, code/data reviews and suggestions! Co-development signoff request: =============================== I would like to thank Ryan Roberts for his original RFC [1] and request his co-developer signoff on patch 6 in this series. Thanks Ryan! System setup for testing: ========================= Testing of this patch series was done with mm-unstable as of 9-27-2024, commit de2fbaa6d9c3576ec7133ed02a370ec9376bf000 (without this patch-series) and mm-unstable 9-30-2024 commit c121617e3606be6575cdacfdb63cc8d67b46a568 (with this patch-series). Data was gathered on an Intel Sapphire Rapids server, dual-socket 56 cores per socket, 4 IAA devices per socket, 503 GiB RAM and 525G SSD disk partition swap. Core frequency was fixed at 2500MHz. The vm-scalability "usemem" test was run in a cgroup whose memory.high was fixed at 150G. The is no swap limit set for the cgroup. 30 usemem processes were run, each allocating and writing 10G of memory, and sleeping for 10 sec before exiting: usemem --init-time -w -O -s 10 -n 30 10g Other kernel configuration parameters: zswap compressors : zstd, deflate-iaa zswap allocator : zsmalloc vm.page-cluster : 2 In the experiments where "deflate-iaa" is used as the zswap compressor, IAA "compression verification" is enabled by default (cat /sys/bus/dsa/drivers/crypto/verify_compress). Hence each IAA compression will be decompressed internally by the "iaa_crypto" driver, the crc-s returned by the hardware will be compared and errors reported in case of mismatches. Thus "deflate-iaa" helps ensure better data integrity as compared to the software compressors, and the experimental data listed below is with verify_compress set to "1". Metrics reporting methodology: ============================== Total and average throughput are derived from the individual 30 processes' throughputs reported by usemem. elapsed/sys times are measured with perf. All percentage changes are "new" vs. "old"; hence a positive value denotes an increase in the metric, whether it is throughput or latency, and a negative value denotes a reduction in the metric. Positive throughput change percentages and negative latency change percentages denote improvements. 
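As a worked example of this convention, take the 64K-folio zstd runs reported below: total throughput goes from 5,222,213 KB/s (before-case1) to 6,159,776 KB/s (after), i.e. (6,159,776 - 5,222,213) / 5,222,213 ≈ +18%, an improvement; elapsed time drops from 120.50 sec to 108.33 sec, i.e. (108.33 - 120.50) / 120.50 ≈ -10%, likewise an improvement since it is a latency metric.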
The vm stats and sysfs hugepages stats included with the performance data provide details on the swapout activity to zswap/swap device. Testing labels used in data summaries: ====================================== The data refers to these test configurations and the before/after comparisons that they do: before-case1: ------------- mm-unstable 9-27-2024, CONFIG_THP_SWAP=N (compares zswap 4K vs. zswap 64K) In this scenario, CONFIG_THP_SWAP=N results in 64K/2M folios to be split into 4K folios that get processed by zswap. before-case2: ------------- mm-unstable 9-27-2024, CONFIG_THP_SWAP=Y (compares SSD swap large folios vs. zswap large folios) In this scenario, CONFIG_THP_SWAP=Y results in zswap rejecting large folios, which will then be stored by the SSD swap device. after: ------ v10 of this patch-series, CONFIG_THP_SWAP=Y The "after" is CONFIG_THP_SWAP=Y and v10 of this patch-series, that results in 64K/2M folios to not be split, and to be processed by zswap_store. Regression Testing: =================== I ran vm-scalability usemem without large folios, i.e., only 4K folios with mm-unstable and this patch-series. The main goal was to make sure that there is no functional or performance regression wrt the earlier zswap behavior for 4K folios, now that 4K folios will be processed by the new zswap_store() code. The data indicates there is no significant regression. ------------------------------------------------------------------------------- 4K folios: ========== zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 4,793,363 4,880,978 4,853,074 1% -1% Average throughput (KB/s) 159,778 162,699 161,769 1% -1% elapsed time (sec) 130.14 123.17 126.29 -3% 3% sys time (sec) 3,135.53 2,985.64 3,083.18 -2% 3% memcg_high 446,826 444,626 452,930 memcg_swap_fail 0 0 0 zswpout 48,932,107 48,931,971 48,931,820 zswpin 383 386 397 pswpout 0 0 0 pswpin 0 0 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 0 0 0 pgmajfault 3,063 3,077 3,479 swap_ra 93 94 96 swap_ra_hit 47 47 50 ZSWPOUT-64kB n/a n/a 0 SWPOUT-64kB 0 0 0 ------------------------------------------------------------------------------- Performance Testing: ==================== We list the data for 64K folios with before/after data per-compressor, followed by the same for 2M pmd-mappable folios. ------------------------------------------------------------------------------- 64K folios: zstd: ================= zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. 
case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,222,213 1,076,611 6,159,776 18% 472% Average throughput (KB/s) 174,073 35,887 205,325 18% 472% elapsed time (sec) 120.50 347.16 108.33 -10% -69% sys time (sec) 2,930.33 248.16 2,549.65 -13% 927% memcg_high 416,773 552,200 465,874 memcg_swap_fail 3,192,906 1,293 1,012 zswpout 48,931,583 20,903 48,931,218 zswpin 384 363 410 pswpout 0 40,778,448 0 pswpin 0 16 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 3,192,906 1,293 1,012 pgmajfault 3,452 3,072 3,061 swap_ra 90 87 107 swap_ra_hit 42 43 57 ZSWPOUT-64kB n/a n/a 3,057,173 SWPOUT-64kB 0 2,548,653 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 64K folios: deflate-iaa: ======================== zswap compressor deflate-iaa deflate-iaa deflate-iaa deflate-iaa v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,652,608 1,089,180 7,189,778 27% 560% Average throughput (KB/s) 188,420 36,306 239,659 27% 560% elapsed time (sec) 102.90 343.35 87.05 -15% -75% sys time (sec) 2,246.86 213.53 1,864.16 -17% 773% memcg_high 576,104 502,907 642,083 memcg_swap_fail 4,016,117 1,407 1,478 zswpout 61,163,423 22,444 57,798,716 zswpin 401 368 454 pswpout 0 40,862,080 0 pswpin 0 20 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 4,016,117 1,407 1,478 pgmajfault 3,063 3,153 3,122 swap_ra 96 93 156 swap_ra_hit 46 45 83 ZSWPOUT-64kB n/a n/a 3,611,032 SWPOUT-64kB 0 2,553,880 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 2M folios: zstd: ================ zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,895,500 1,109,694 6,484,224 10% 484% Average throughput (KB/s) 196,516 36,989 216,140 10% 484% elapsed time (sec) 108.77 334.28 106.33 -2% -68% sys time (sec) 2,657.14 94.88 2,376.13 -11% 2404% memcg_high 64,200 66,316 56,898 memcg_swap_fail 101,182 70 27 zswpout 48,931,499 36,507 48,890,640 zswpin 380 379 377 pswpout 0 40,166,400 0 pswpin 0 0 0 thp_swpout 0 78,450 0 thp_swpout_fallback 101,182 70 27 2MB-mthp_swpout_fallback 0 0 27 pgmajfault 3,067 3,417 3,311 swap_ra 91 90 854 swap_ra_hit 45 45 810 ZSWPOUT-2MB n/a n/a 95,459 SWPOUT-2MB 0 78,450 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 2M folios: deflate-iaa: ======================= zswap compressor deflate-iaa deflate-iaa deflate-iaa deflate-iaa v10 before-case1 before-case2 after vs. vs. 
case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 6,286,587 1,126,785 7,073,464 13% 528% Average throughput (KB/s) 209,552 37,559 235,782 13% 528% elapsed time (sec) 96.19 333.03 85.79 -11% -74% sys time (sec) 2,141.44 99.96 1,826.67 -15% 1727% memcg_high 99,253 64,666 79,718 memcg_swap_fail 129,074 53 165 zswpout 61,312,794 28,321 56,045,120 zswpin 383 406 403 pswpout 0 40,048,128 0 pswpin 0 0 0 thp_swpout 0 78,219 0 thp_swpout_fallback 129,074 53 165 2MB-mthp_swpout_fallback 0 0 165 pgmajfault 3,430 3,077 31,468 swap_ra 91 103 84,373 swap_ra_hit 47 46 84,317 ZSWPOUT-2MB n/a n/a 109,229 SWPOUT-2MB 0 78,219 0 ------------------------------------------------------------------------------- And finally, this is a comparison of deflate-iaa vs. zstd with v10 of this patch-series: --------------------------------------------- zswap_store large folios v10 Impr w/ deflate-iaa vs. zstd 64K folios 2M folios --------------------------------------------- Throughput (KB/s) 17% 9% elapsed time (sec) -20% -19% sys time (sec) -27% -23% --------------------------------------------- Conclusions based on the performance results: ============================================= v10 wrt before-case1: --------------------- We see significant improvements in throughput, elapsed and sys time for zstd and deflate-iaa, when comparing before-case1 (THP_SWAP=N) vs. after (THP_SWAP=Y) with zswap_store large folios. v10 wrt before-case2: --------------------- We see even more significant improvements in throughput and elapsed time for zstd and deflate-iaa, when comparing before-case2 (large-folio-SSD) vs. after (large-folio-zswap). The sys time increases with large-folio-zswap as expected, due to the CPU compression time vs. asynchronous disk write times, as pointed out by Ying and Yosry. In before-case2, when zswap does not store large folios, only allocations and cgroup charging due to 4K folio zswap stores count towards the cgroup memory limit. However, in the after scenario, with the introduction of zswap_store() of large folios, there is an added component of the zswap compressed pool usage from large folio stores from potentially all 30 processes, that gets counted towards the memory limit. As a result, we see higher swapout activity in the "after" data. Summary: ======== The v10 data presented above shows that zswap_store of large folios demonstrates good throughput/performance improvements compared to conventional SSD swap of large folios with a sufficiently large 525G SSD swap device. Hence, it seems reasonable for zswap_store to support large folios, so that further performance improvements can be implemented. In the experimental setup used in this patchset, we have enabled IAA compress verification to ensure additional hardware data integrity CRC checks not currently done by the software compressors. We see good throughput/latency improvements with deflate-iaa vs. zstd with zswap_store of large folios. Some of the ideas for further reducing latency that have shown promise in our experiments, are: 1) IAA compress/decompress batching. 2) Distributing compress jobs across all IAA devices on the socket. The tests run for this patchset are using only 1 IAA device per core, that avails of 2 compress engines on the device. In our experiments with IAA batching, we distribute compress jobs from all cores to the 8 compress engines available per socket. We further compress the pages in each folio in parallel in the accelerator. 
As a result, we improve compress latency and reclaim throughput. In decompress batching, we use swapin_readahead to generate a prefetch batch of 4K folios that we decompress in parallel in IAA. ------------------------------------------------------------------------------ IAA compress/decompress batching Further improvements wrt v10 zswap_store Sequential subpage store using "deflate-iaa": "deflate-iaa" Batching "deflate-iaa-canned" [2] Batching Additional Impr Additional Impr 64K folios 2M folios 64K folios 2M folios ------------------------------------------------------------------------------ Throughput (KB/s) 19% 43% 26% 55% elapsed time (sec) -5% -14% -10% -21% sys time (sec) 4% -7% -4% -18% ------------------------------------------------------------------------------ With zswap IAA compress/decompress batching, we are able to demonstrate significant performance improvements and memory savings in server scalability experiments in highly contended system scenarios under significant memory pressure; as compared to software compressors. We hope to submit this work in subsequent patch series. The current patch-series is a prequisite for these future submissions. [1] https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u [2] https://patchwork.kernel.org/project/linux-crypto/cover/cover.1710969449.git.andre.glover@linux.intel.com/ This patch (of 6): This resolves an issue with obj_cgroup_get() not being defined if CONFIG_MEMCG is not defined. Before this patch, we would see build errors if obj_cgroup_get() is called from code that is agnostic of CONFIG_MEMCG. The zswap_store() changes for large folios in subsequent commits will require the use of obj_cgroup_get() in zswap code that falls into this category. Link: https://lkml.kernel.org/r/20241001053222.6944-1-kanchana.p.sridhar@intel.com Link: https://lkml.kernel.org/r/20241001053222.6944-2-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Reviewed-by: Nhat Pham Reviewed-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0bd8f61a5597e..5502aa8e138ec 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1233,6 +1233,10 @@ struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *css) return NULL; } +static inline void obj_cgroup_get(struct obj_cgroup *objcg) +{ +} + static inline void obj_cgroup_put(struct obj_cgroup *objcg) { } From 3d0f560a367ee2bc9ec369f5e844d8116d850f1c Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:17 -0700 Subject: [PATCH 161/215] mm: zswap: modify zswap_compress() to accept a page instead of a folio For zswap_store() to be able to store a large folio by compressing it one page at a time, zswap_compress() needs to accept a page as input. This will allow us to iterate through each page in the folio in zswap_store(), compress it and store it in the zpool. 
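Taken together with the later patches in this series, the per-page signature lets the store path walk a large folio one subpage at a time. A rough sketch of that loop (illustrative only; the real code added in patch 6 also inserts each entry into the zswap xarray, charges the objcg, and unwinds all previously stored subpages on failure):

	long index, nr_pages = folio_nr_pages(folio);

	for (index = 0; index < nr_pages; index++) {
		struct page *page = folio_page(folio, index);
		struct zswap_entry *entry;

		entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio));
		if (!entry || !zswap_compress(page, entry))
			goto store_failed;	/* whole folio is backed out */
	}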
Link: https://lkml.kernel.org/r/20241001053222.6944-3-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Reviewed-by: Nhat Pham Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index a400324a45a7b..a62a84b66bd72 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -875,7 +875,7 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } -static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) +static bool zswap_compress(struct page *page, struct zswap_entry *entry) { struct crypto_acomp_ctx *acomp_ctx; struct scatterlist input, output; @@ -893,7 +893,7 @@ static bool zswap_compress(struct folio *folio, struct zswap_entry *entry) dst = acomp_ctx->buffer; sg_init_table(&input, 1); - sg_set_folio(&input, folio, PAGE_SIZE, 0); + sg_set_page(&input, page, PAGE_SIZE, 0); /* * We need PAGE_SIZE * 2 here since there maybe over-compression case, @@ -1457,7 +1457,7 @@ bool zswap_store(struct folio *folio) mem_cgroup_put(memcg); } - if (!zswap_compress(folio, entry)) + if (!zswap_compress(&folio->page, entry)) goto put_pool; entry->swpentry = swp; From 0201c054c2a38c53e8949700468ae91623f8cea9 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:18 -0700 Subject: [PATCH 162/215] mm: zswap: rename zswap_pool_get() to zswap_pool_tryget() Modify the name of the existing zswap_pool_get() to zswap_pool_tryget() to be representative of the call it makes to percpu_ref_tryget(). A subsequent patch will introduce a new zswap_pool_get() that calls percpu_ref_get(). The intent behind this change is for higher level zswap API such as zswap_store() to call zswap_pool_tryget() to check upfront if the pool's refcount is "0" (which means it could be getting destroyed) and to handle this as an error condition. zswap_store() would proceed only if zswap_pool_tryget() returns success, and any additional pool refcounts that need to be obtained for compressing sub-pages in a large folio could simply call zswap_pool_get(). 
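The split mirrors the percpu_ref primitives underneath: percpu_ref_tryget() can fail once the count has dropped to zero, while percpu_ref_get() assumes the caller already holds a reference. A minimal sketch of the resulting pair (zswap_pool_tryget() as renamed here, zswap_pool_get() as described for the follow-up patch):

	static int __must_check zswap_pool_tryget(struct zswap_pool *pool)
	{
		if (!pool)
			return 0;
		return percpu_ref_tryget(&pool->ref);
	}

	/* only valid while the caller already holds a reference */
	static void zswap_pool_get(struct zswap_pool *pool)
	{
		percpu_ref_get(&pool->ref);
	}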
Link: https://lkml.kernel.org/r/20241001053222.6944-4-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Acked-by: Yosry Ahmed Reviewed-by: Chengming Zhou Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- mm/zswap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index a62a84b66bd72..bcb1b9cc9645b 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -402,7 +402,7 @@ static void __zswap_pool_empty(struct percpu_ref *ref) spin_unlock_bh(&zswap_pools_lock); } -static int __must_check zswap_pool_get(struct zswap_pool *pool) +static int __must_check zswap_pool_tryget(struct zswap_pool *pool) { if (!pool) return 0; @@ -440,7 +440,7 @@ static struct zswap_pool *zswap_pool_current_get(void) rcu_read_lock(); pool = __zswap_pool_current(); - if (!zswap_pool_get(pool)) + if (!zswap_pool_tryget(pool)) pool = NULL; rcu_read_unlock(); @@ -461,7 +461,7 @@ static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor) if (strcmp(zpool_get_type(pool->zpool), type)) continue; /* if we can't get it, it's about to be destroyed */ - if (!zswap_pool_get(pool)) + if (!zswap_pool_tryget(pool)) continue; return pool; } From 6e1fa555ec772046ec3b903f507ff7fed5323796 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:20 -0700 Subject: [PATCH 163/215] mm: zswap: modify zswap_stored_pages to be atomic_long_t For zswap_store() to support large folios, we need to be able to do a batch update of zswap_stored_pages upon successful store of all pages in the folio. For this, we need to add folio_nr_pages(), which returns a long, to zswap_stored_pages. 
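Switching to atomic_long_t is what allows the store path to account a whole folio with a single atomic add rather than one increment per subpage. A sketch of the batched success-path update described in this series (the cgroup zswap charge is batched the same way, using the total of the per-page compressed sizes):

	long nr_pages = folio_nr_pages(folio);

	atomic_long_add(nr_pages, &zswap_stored_pages);
	count_vm_events(ZSWPOUT, nr_pages);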
Link: https://lkml.kernel.org/r/20241001053222.6944-6-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Acked-by: Yosry Ahmed Acked-by: Johannes Weiner Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- fs/proc/meminfo.c | 2 +- include/linux/zswap.h | 2 +- mm/zswap.c | 19 +++++++++++++------ 3 files changed, 15 insertions(+), 8 deletions(-) diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 245171d9164be..8ba9b14723902 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -91,7 +91,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) #ifdef CONFIG_ZSWAP show_val_kb(m, "Zswap: ", zswap_total_pages()); seq_printf(m, "Zswapped: %8lu kB\n", - (unsigned long)atomic_read(&zswap_stored_pages) << + (unsigned long)atomic_long_read(&zswap_stored_pages) << (PAGE_SHIFT - 10)); #endif show_val_kb(m, "Dirty: ", diff --git a/include/linux/zswap.h b/include/linux/zswap.h index 9cd1beef06548..d961ead91bf1e 100644 --- a/include/linux/zswap.h +++ b/include/linux/zswap.h @@ -7,7 +7,7 @@ struct lruvec; -extern atomic_t zswap_stored_pages; +extern atomic_long_t zswap_stored_pages; #ifdef CONFIG_ZSWAP diff --git a/mm/zswap.c b/mm/zswap.c index bcb1b9cc9645b..1692273ff08c0 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -43,7 +43,7 @@ * statistics **********************************/ /* The number of compressed pages currently stored in zswap */ -atomic_t zswap_stored_pages = ATOMIC_INIT(0); +atomic_long_t zswap_stored_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -802,7 +802,7 @@ static void zswap_entry_free(struct zswap_entry *entry) obj_cgroup_put(entry->objcg); } zswap_entry_cache_free(entry); - atomic_dec(&zswap_stored_pages); + atomic_long_dec(&zswap_stored_pages); } /********************************* @@ -1233,7 +1233,7 @@ static unsigned long zswap_shrinker_count(struct shrinker *shrinker, nr_stored = memcg_page_state(memcg, MEMCG_ZSWAPPED); } else { nr_backing = zswap_total_pages(); - nr_stored = atomic_read(&zswap_stored_pages); + nr_stored = atomic_long_read(&zswap_stored_pages); } if (!nr_stored) @@ -1502,7 +1502,7 @@ bool zswap_store(struct folio *folio) } /* update stats */ - atomic_inc(&zswap_stored_pages); + atomic_long_inc(&zswap_stored_pages); count_vm_event(ZSWPOUT); return true; @@ -1654,6 +1654,13 @@ static int debugfs_get_total_size(void *data, u64 *val) } DEFINE_DEBUGFS_ATTRIBUTE(total_size_fops, debugfs_get_total_size, NULL, "%llu\n"); +static int debugfs_get_stored_pages(void *data, u64 *val) +{ + *val = atomic_long_read(&zswap_stored_pages); + return 0; +} +DEFINE_DEBUGFS_ATTRIBUTE(stored_pages_fops, debugfs_get_stored_pages, NULL, "%llu\n"); + static int zswap_debugfs_init(void) { if (!debugfs_initialized()) @@ -1677,8 +1684,8 @@ static int zswap_debugfs_init(void) zswap_debugfs_root, &zswap_written_back_pages); debugfs_create_file("pool_total_size", 0444, zswap_debugfs_root, NULL, &total_size_fops); - debugfs_create_atomic_t("stored_pages", 0444, - zswap_debugfs_root, &zswap_stored_pages); + debugfs_create_file("stored_pages", 0444, + zswap_debugfs_root, NULL, &stored_pages_fops); return 0; } From b7c0ccdfbafdec98699ddb6f164beebf16f0bc45 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:21 -0700 Subject: [PATCH 164/215] mm: zswap: support large folios in zswap_store() This series enables zswap_store() 
to accept and store large folios. The most significant contribution in this series is from the earlier RFC submitted by Ryan Roberts [1]. Ryan's original RFC has been migrated to mm-unstable as of 9-30-2024 in patch 6 of this series, and adapted based on code review comments received for the current patch-series. [1]: [RFC PATCH v1] mm: zswap: Store large folios without splitting https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u The first few patches do the prep work for supporting large folios in zswap_store. Patch 6 provides the main functionality to swap-out large folios in zswap. Patch 7 adds sysfs per-order hugepages "zswpout" counters that get incremented upon successful zswap_store of large folios, and also updates the documentation for this: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout This series is a pre-requisite for zswap compress batching of large folio swap-out and decompress batching of swap-ins based on swapin_readahead(), using Intel IAA hardware acceleration, which we would like to submit in subsequent patch-series, with performance improvement data. Thanks to Ying Huang for pre-posting review feedback and suggestions! Thanks also to Nhat, Yosry, Johannes, Barry, Chengming, Usama, Ying and Matthew for their helpful feedback, code/data reviews and suggestions! I would like to thank Ryan Roberts for his original RFC [1]. System setup for testing: ========================= Testing of this series was done with mm-unstable as of 9-27-2024, commit de2fbaa6d9c3576ec7133ed02a370ec9376bf000 (without this patch-series) and mm-unstable 9-30-2024 commit c121617e3606be6575cdacfdb63cc8d67b46a568 (with this patch-series). Data was gathered on an Intel Sapphire Rapids server, dual-socket 56 cores per socket, 4 IAA devices per socket, 503 GiB RAM and 525G SSD disk partition swap. Core frequency was fixed at 2500MHz. The vm-scalability "usemem" test was run in a cgroup whose memory.high was fixed at 150G. The is no swap limit set for the cgroup. 30 usemem processes were run, each allocating and writing 10G of memory, and sleeping for 10 sec before exiting: usemem --init-time -w -O -s 10 -n 30 10g Other kernel configuration parameters: zswap compressors : zstd, deflate-iaa zswap allocator : zsmalloc vm.page-cluster : 2 In the experiments where "deflate-iaa" is used as the zswap compressor, IAA "compression verification" is enabled by default (cat /sys/bus/dsa/drivers/crypto/verify_compress). Hence each IAA compression will be decompressed internally by the "iaa_crypto" driver, the crc-s returned by the hardware will be compared and errors reported in case of mismatches. Thus "deflate-iaa" helps ensure better data integrity as compared to the software compressors, and the experimental data listed below is with verify_compress set to "1". Metrics reporting methodology: ============================== Total and average throughput are derived from the individual 30 processes' throughputs reported by usemem. elapsed/sys times are measured with perf. All percentage changes are "new" vs. "old"; hence a positive value denotes an increase in the metric, whether it is throughput or latency, and a negative value denotes a reduction in the metric. Positive throughput change percentages and negative latency change percentages denote improvements. The vm stats and sysfs hugepages stats included with the performance data provide details on the swapout activity to zswap/swap device. 
Testing labels used in data summaries: ====================================== The data refers to these test configurations and the before/after comparisons that they do: before-case1: ------------- mm-unstable 9-27-2024, CONFIG_THP_SWAP=N (compares zswap 4K vs. zswap 64K) In this scenario, CONFIG_THP_SWAP=N results in 64K/2M folios to be split into 4K folios that get processed by zswap. before-case2: ------------- mm-unstable 9-27-2024, CONFIG_THP_SWAP=Y (compares SSD swap large folios vs. zswap large folios) In this scenario, CONFIG_THP_SWAP=Y results in zswap rejecting large folios, which will then be stored by the SSD swap device. after: ------ v10 of this patch-series, CONFIG_THP_SWAP=Y The "after" is CONFIG_THP_SWAP=Y and v10 of this patch-series, that results in 64K/2M folios to not be split, and to be processed by zswap_store. Regression Testing: =================== I ran vm-scalability usemem without large folios, i.e., only 4K folios with mm-unstable and this patch-series. The main goal was to make sure that there is no functional or performance regression wrt the earlier zswap behavior for 4K folios, now that 4K folios will be processed by the new zswap_store() code. The data indicates there is no significant regression. ------------------------------------------------------------------------------- 4K folios: ========== zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 4,793,363 4,880,978 4,853,074 1% -1% Average throughput (KB/s) 159,778 162,699 161,769 1% -1% elapsed time (sec) 130.14 123.17 126.29 -3% 3% sys time (sec) 3,135.53 2,985.64 3,083.18 -2% 3% memcg_high 446,826 444,626 452,930 memcg_swap_fail 0 0 0 zswpout 48,932,107 48,931,971 48,931,820 zswpin 383 386 397 pswpout 0 0 0 pswpin 0 0 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 0 0 0 pgmajfault 3,063 3,077 3,479 swap_ra 93 94 96 swap_ra_hit 47 47 50 ZSWPOUT-64kB n/a n/a 0 SWPOUT-64kB 0 0 0 ------------------------------------------------------------------------------- Performance Testing: ==================== We list the data for 64K folios with before/after data per-compressor, followed by the same for 2M pmd-mappable folios. ------------------------------------------------------------------------------- 64K folios: zstd: ================= zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,222,213 1,076,611 6,159,776 18% 472% Average throughput (KB/s) 174,073 35,887 205,325 18% 472% elapsed time (sec) 120.50 347.16 108.33 -10% -69% sys time (sec) 2,930.33 248.16 2,549.65 -13% 927% memcg_high 416,773 552,200 465,874 memcg_swap_fail 3,192,906 1,293 1,012 zswpout 48,931,583 20,903 48,931,218 zswpin 384 363 410 pswpout 0 40,778,448 0 pswpin 0 16 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 3,192,906 1,293 1,012 pgmajfault 3,452 3,072 3,061 swap_ra 90 87 107 swap_ra_hit 42 43 57 ZSWPOUT-64kB n/a n/a 3,057,173 SWPOUT-64kB 0 2,548,653 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 64K folios: deflate-iaa: ======================== zswap compressor deflate-iaa deflate-iaa deflate-iaa deflate-iaa v10 before-case1 before-case2 after vs. vs. 
case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,652,608 1,089,180 7,189,778 27% 560% Average throughput (KB/s) 188,420 36,306 239,659 27% 560% elapsed time (sec) 102.90 343.35 87.05 -15% -75% sys time (sec) 2,246.86 213.53 1,864.16 -17% 773% memcg_high 576,104 502,907 642,083 memcg_swap_fail 4,016,117 1,407 1,478 zswpout 61,163,423 22,444 57,798,716 zswpin 401 368 454 pswpout 0 40,862,080 0 pswpin 0 20 0 thp_swpout 0 0 0 thp_swpout_fallback 0 0 0 64kB-mthp_swpout_fallback 4,016,117 1,407 1,478 pgmajfault 3,063 3,153 3,122 swap_ra 96 93 156 swap_ra_hit 46 45 83 ZSWPOUT-64kB n/a n/a 3,611,032 SWPOUT-64kB 0 2,553,880 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 2M folios: zstd: ================ zswap compressor zstd zstd zstd zstd v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 5,895,500 1,109,694 6,484,224 10% 484% Average throughput (KB/s) 196,516 36,989 216,140 10% 484% elapsed time (sec) 108.77 334.28 106.33 -2% -68% sys time (sec) 2,657.14 94.88 2,376.13 -11% 2404% memcg_high 64,200 66,316 56,898 memcg_swap_fail 101,182 70 27 zswpout 48,931,499 36,507 48,890,640 zswpin 380 379 377 pswpout 0 40,166,400 0 pswpin 0 0 0 thp_swpout 0 78,450 0 thp_swpout_fallback 101,182 70 27 2MB-mthp_swpout_fallback 0 0 27 pgmajfault 3,067 3,417 3,311 swap_ra 91 90 854 swap_ra_hit 45 45 810 ZSWPOUT-2MB n/a n/a 95,459 SWPOUT-2MB 0 78,450 0 ------------------------------------------------------------------------------- ------------------------------------------------------------------------------- 2M folios: deflate-iaa: ======================= zswap compressor deflate-iaa deflate-iaa deflate-iaa deflate-iaa v10 before-case1 before-case2 after vs. vs. case1 case2 ------------------------------------------------------------------------------- Total throughput (KB/s) 6,286,587 1,126,785 7,073,464 13% 528% Average throughput (KB/s) 209,552 37,559 235,782 13% 528% elapsed time (sec) 96.19 333.03 85.79 -11% -74% sys time (sec) 2,141.44 99.96 1,826.67 -15% 1727% memcg_high 99,253 64,666 79,718 memcg_swap_fail 129,074 53 165 zswpout 61,312,794 28,321 56,045,120 zswpin 383 406 403 pswpout 0 40,048,128 0 pswpin 0 0 0 thp_swpout 0 78,219 0 thp_swpout_fallback 129,074 53 165 2MB-mthp_swpout_fallback 0 0 165 pgmajfault 3,430 3,077 31,468 swap_ra 91 103 84,373 swap_ra_hit 47 46 84,317 ZSWPOUT-2MB n/a n/a 109,229 SWPOUT-2MB 0 78,219 0 ------------------------------------------------------------------------------- And finally, this is a comparison of deflate-iaa vs. zstd with v10 of this patch-series: --------------------------------------------- zswap_store large folios v10 Impr w/ deflate-iaa vs. zstd 64K folios 2M folios --------------------------------------------- Throughput (KB/s) 17% 9% elapsed time (sec) -20% -19% sys time (sec) -27% -23% --------------------------------------------- Conclusions based on the performance results: ============================================= v10 wrt before-case1: --------------------- We see significant improvements in throughput, elapsed and sys time for zstd and deflate-iaa, when comparing before-case1 (THP_SWAP=N) vs. after (THP_SWAP=Y) with zswap_store large folios. 
v10 wrt before-case2:
---------------------

We see even more significant improvements in throughput and elapsed time
for zstd and deflate-iaa, when comparing before-case2 (large-folio-SSD) vs.
after (large-folio-zswap). The sys time increases with large-folio-zswap as
expected, due to the CPU compression time vs. asynchronous disk write
times, as pointed out by Ying and Yosry.

In before-case2, when zswap does not store large folios, only allocations
and cgroup charging due to 4K folio zswap stores count towards the cgroup
memory limit. However, in the after scenario, with the introduction of
zswap_store() of large folios, there is an added component: the zswap
compressed pool usage from large folio stores, potentially from all 30
processes, which gets counted towards the memory limit. As a result, we see
higher swapout activity in the "after" data.

Summary:
========

The v10 data presented above shows that zswap_store of large folios
demonstrates good throughput/performance improvements compared to
conventional SSD swap of large folios with a sufficiently large 525G SSD
swap device. Hence, it seems reasonable for zswap_store to support large
folios, so that further performance improvements can be implemented.

In the experimental setup used in this patchset, we have enabled IAA
compress verification to ensure additional hardware data integrity CRC
checks not currently done by the software compressors.

We see good throughput/latency improvements with deflate-iaa vs. zstd with
zswap_store of large folios.

Some of the ideas for further reducing latency that have shown promise in
our experiments are:

1) IAA compress/decompress batching.

2) Distributing compress jobs across all IAA devices on the socket.

The tests run for this patchset use only 1 IAA device per core, which
provides 2 compress engines on the device. In our experiments with IAA
batching, we distribute compress jobs from all cores to the 8 compress
engines available per socket. We further compress the pages in each folio
in parallel in the accelerator. As a result, we improve compress latency
and reclaim throughput.

In decompress batching, we use swapin_readahead to generate a prefetch
batch of 4K folios that we decompress in parallel in IAA.

------------------------------------------------------------------------------
                   IAA compress/decompress batching
          Further improvements wrt v10 zswap_store Sequential
                   subpage store using "deflate-iaa":

                   "deflate-iaa" Batching     "deflate-iaa-canned" [2] Batching
                   Additional Impr            Additional Impr
                   64K folios    2M folios    64K folios    2M folios
------------------------------------------------------------------------------
Throughput (KB/s)        19%          43%           26%           55%
elapsed time (sec)       -5%         -14%          -10%          -21%
sys time (sec)            4%          -7%           -4%          -18%
------------------------------------------------------------------------------

With zswap IAA compress/decompress batching, we are able to demonstrate
significant performance improvements and memory savings in server
scalability experiments in highly contended system scenarios under
significant memory pressure, as compared to software compressors. We hope
to submit this work in subsequent patch series. The current patch-series is
a prerequisite for these future submissions.

This patch (of 7):

zswap_store() will store large folios by compressing them page by page.

This patch provides a sequential implementation of storing a large folio
in zswap_store() by iterating through each page in the folio to compress
and store it in the zswap zpool.
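For orientation, the flow described above can be sketched roughly as
follows. This is only a simplified outline with a hypothetical wrapper
name, not the actual mm/zswap.c code (which appears in the diff below);
limit checks, objcg/pool reference handling and error cleanup are omitted:

  /*
   * Simplified outline of the sequential per-page store flow; see the
   * mm/zswap.c diff below for the real implementation.
   */
  static bool zswap_store_outline(struct folio *folio,
                                  struct obj_cgroup *objcg,
                                  struct zswap_pool *pool)
  {
          long nr_pages = folio_nr_pages(folio);
          size_t compressed_bytes = 0;
          long index;

          for (index = 0; index < nr_pages; index++) {
                  /* Compress and store one page; returns compressed size or < 0. */
                  ssize_t bytes = zswap_store_page(folio_page(folio, index),
                                                   objcg, pool);

                  if (bytes < 0)
                          return false; /* caller invalidates pages already stored */
                  compressed_bytes += bytes;
          }

          /* Charge and update stats once for the whole folio. */
          if (objcg)
                  obj_cgroup_charge_zswap(objcg, compressed_bytes);
          atomic_long_add(nr_pages, &zswap_stored_pages);
          count_vm_events(ZSWPOUT, nr_pages);
          return true;
  }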
zswap_store() calls the newly added zswap_store_page() function for each page in the folio. zswap_store_page() handles compressing and storing each page. We check the global and per-cgroup limits once at the beginning of zswap_store(), and only check that the limit is not reached yet. This is racy and inaccurate, but it should be sufficient for now. We also obtain initial references to the relevant objcg and pool to guarantee that subsequent references can be acquired by zswap_store_page(). A new function zswap_pool_get() is added to facilitate this. If these one-time checks pass, we compress the pages of the folio, while maintaining a running count of compressed bytes for all the folio's pages. If all pages are successfully compressed and stored, we do the cgroup zswap charging with the total compressed bytes, and batch update the zswap_stored_pages atomic/zswpout event stats with folio_nr_pages() once, before returning from zswap_store(). If an error is encountered during the store of any page in the folio, all pages in that folio currently stored in zswap will be invalidated. Thus, a folio is either entirely stored in zswap, or entirely not stored in zswap. The most important value provided by this patch is it enables swapping out large folios to zswap without splitting them. Furthermore, it batches some operations while doing so (cgroup charging, stats updates). This patch also forms the basis for building compress batching of pages in a large folio in zswap_store() by compressing up to say, 8 pages of the folio in parallel in hardware using the Intel In-Memory Analytics Accelerator (Intel IAA). This change reuses and adapts the functionality in Ryan Roberts' RFC patch [1]: "[RFC,v1] mm: zswap: Store large folios without splitting" [1] https://lore.kernel.org/linux-mm/20231019110543.3284654-1-ryan.roberts@arm.com/T/#u Link: https://lkml.kernel.org/r/20241001053222.6944-1-kanchana.p.sridhar@intel.com Link: https://lkml.kernel.org/r/20241001053222.6944-7-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Originally-by: Ryan Roberts Acked-by: Johannes Weiner Acked-by: Yosry Ahmed Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: "Huang, Ying" Cc: Matthew Wilcox Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- mm/zswap.c | 189 ++++++++++++++++++++++++++++++++++------------------- 1 file changed, 121 insertions(+), 68 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index 1692273ff08c0..d6b1e81860b99 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -410,6 +410,12 @@ static int __must_check zswap_pool_tryget(struct zswap_pool *pool) return percpu_ref_tryget(&pool->ref); } +/* The caller must already have a reference. 
*/ +static void zswap_pool_get(struct zswap_pool *pool) +{ + percpu_ref_get(&pool->ref); +} + static void zswap_pool_put(struct zswap_pool *pool) { percpu_ref_put(&pool->ref); @@ -1403,68 +1409,38 @@ static void shrink_worker(struct work_struct *w) /********************************* * main API **********************************/ -bool zswap_store(struct folio *folio) + +static ssize_t zswap_store_page(struct page *page, + struct obj_cgroup *objcg, + struct zswap_pool *pool) { - swp_entry_t swp = folio->swap; - pgoff_t offset = swp_offset(swp); - struct xarray *tree = swap_zswap_tree(swp); struct zswap_entry *entry, *old; - struct obj_cgroup *objcg = NULL; - struct mem_cgroup *memcg = NULL; - - VM_WARN_ON_ONCE(!folio_test_locked(folio)); - VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); - - /* Large folios aren't supported */ - if (folio_test_large(folio)) - return false; - - if (!zswap_enabled) - goto check_old; - - /* Check cgroup limits */ - objcg = get_obj_cgroup_from_folio(folio); - if (objcg && !obj_cgroup_may_zswap(objcg)) { - memcg = get_mem_cgroup_from_objcg(objcg); - if (shrink_memcg(memcg)) { - mem_cgroup_put(memcg); - goto reject; - } - mem_cgroup_put(memcg); - } - - if (zswap_check_limits()) - goto reject; /* allocate entry */ - entry = zswap_entry_cache_alloc(GFP_KERNEL, folio_nid(folio)); + entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; goto reject; } - /* if entry is successfully added, it keeps the reference */ - entry->pool = zswap_pool_current_get(); - if (!entry->pool) - goto freepage; + /* zswap_store() already holds a ref on 'objcg' and 'pool' */ + if (objcg) + obj_cgroup_get(objcg); + zswap_pool_get(pool); - if (objcg) { - memcg = get_mem_cgroup_from_objcg(objcg); - if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { - mem_cgroup_put(memcg); - goto put_pool; - } - mem_cgroup_put(memcg); - } + /* if entry is successfully added, it keeps the reference */ + entry->pool = pool; - if (!zswap_compress(&folio->page, entry)) - goto put_pool; + if (!zswap_compress(page, entry)) + goto put_pool_objcg; - entry->swpentry = swp; + entry->swpentry = page_swap_entry(page); entry->objcg = objcg; entry->referenced = true; - old = xa_store(tree, offset, entry, GFP_KERNEL); + old = xa_store(swap_zswap_tree(entry->swpentry), + swp_offset(entry->swpentry), + entry, GFP_KERNEL); if (xa_is_err(old)) { int err = xa_err(old); @@ -1481,11 +1457,6 @@ bool zswap_store(struct folio *folio) if (old) zswap_entry_free(old); - if (objcg) { - obj_cgroup_charge_zswap(objcg, entry->length); - count_objcg_events(objcg, ZSWPOUT, 1); - } - /* * We finish initializing the entry while it's already in xarray. * This is safe because: @@ -1501,32 +1472,114 @@ bool zswap_store(struct folio *folio) zswap_lru_add(&zswap_list_lru, entry); } - /* update stats */ - atomic_long_inc(&zswap_stored_pages); - count_vm_event(ZSWPOUT); - - return true; + /* + * We shouldn't have any possibility of failure after the entry is + * added in the xarray. The pool/objcg refs obtained here will only + * be dropped if/when zswap_entry_free() gets called. 
+ */ + return entry->length; store_failed: zpool_free(entry->pool->zpool, entry->handle); -put_pool: - zswap_pool_put(entry->pool); -freepage: +put_pool_objcg: + zswap_pool_put(pool); + obj_cgroup_put(objcg); zswap_entry_cache_free(entry); reject: + return -EINVAL; +} + +bool zswap_store(struct folio *folio) +{ + long nr_pages = folio_nr_pages(folio); + swp_entry_t swp = folio->swap; + struct obj_cgroup *objcg = NULL; + struct mem_cgroup *memcg = NULL; + struct zswap_pool *pool; + size_t compressed_bytes = 0; + bool ret = false; + long index; + + VM_WARN_ON_ONCE(!folio_test_locked(folio)); + VM_WARN_ON_ONCE(!folio_test_swapcache(folio)); + + if (!zswap_enabled) + goto check_old; + + objcg = get_obj_cgroup_from_folio(folio); + if (objcg && !obj_cgroup_may_zswap(objcg)) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (shrink_memcg(memcg)) { + mem_cgroup_put(memcg); + goto put_objcg; + } + mem_cgroup_put(memcg); + } + + if (zswap_check_limits()) + goto put_objcg; + + pool = zswap_pool_current_get(); + if (!pool) + goto put_objcg; + + if (objcg) { + memcg = get_mem_cgroup_from_objcg(objcg); + if (memcg_list_lru_alloc(memcg, &zswap_list_lru, GFP_KERNEL)) { + mem_cgroup_put(memcg); + goto put_pool; + } + mem_cgroup_put(memcg); + } + + for (index = 0; index < nr_pages; ++index) { + struct page *page = folio_page(folio, index); + ssize_t bytes; + + bytes = zswap_store_page(page, objcg, pool); + if (bytes < 0) + goto put_pool; + compressed_bytes += bytes; + } + + if (objcg) { + obj_cgroup_charge_zswap(objcg, compressed_bytes); + count_objcg_events(objcg, ZSWPOUT, nr_pages); + } + + atomic_long_add(nr_pages, &zswap_stored_pages); + count_vm_events(ZSWPOUT, nr_pages); + + ret = true; + +put_pool: + zswap_pool_put(pool); +put_objcg: obj_cgroup_put(objcg); - if (zswap_pool_reached_full) + if (!ret && zswap_pool_reached_full) queue_work(shrink_wq, &zswap_shrink_work); check_old: /* - * If the zswap store fails or zswap is disabled, we must invalidate the - * possibly stale entry which was previously stored at this offset. - * Otherwise, writeback could overwrite the new data in the swapfile. + * If the zswap store fails or zswap is disabled, we must invalidate + * the possibly stale entries which were previously stored at the + * offsets corresponding to each page of the folio. Otherwise, + * writeback could overwrite the new data in the swapfile. 
*/ - entry = xa_erase(tree, offset); - if (entry) - zswap_entry_free(entry); - return false; + if (!ret) { + unsigned type = swp_type(swp); + pgoff_t offset = swp_offset(swp); + struct zswap_entry *entry; + struct xarray *tree; + + for (index = 0; index < nr_pages; ++index) { + tree = swap_zswap_tree(swp_entry(type, offset + index)); + entry = xa_erase(tree, offset + index); + if (entry) + zswap_entry_free(entry); + } + } + + return ret; } bool zswap_load(struct folio *folio) From 0c560dd86040556a9e55d88229d9295672428c78 Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Mon, 30 Sep 2024 22:32:22 -0700 Subject: [PATCH 165/215] mm: swap: count successful large folio zswap stores in hugepage zswpout stats Added a new MTHP_STAT_ZSWPOUT entry to the sysfs transparent_hugepage stats so that successful large folio zswap stores can be accounted under the per-order sysfs "zswpout" stats: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/zswpout Other non-zswap swap device swap-out events will be counted under the existing sysfs "swpout" stats: /sys/kernel/mm/transparent_hugepage/hugepages-*kB/stats/swpout Also, added documentation for the newly added sysfs per-order hugepage "zswpout" stats. The documentation clarifies that only non-zswap swapouts will be accounted in the existing "swpout" stats. Link: https://lkml.kernel.org/r/20241001053222.6944-8-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Reviewed-by: Nhat Pham Cc: Chengming Zhou Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Ryan Roberts Cc: Shakeel Butt Cc: Usama Arif Cc: Wajdi Feghali Cc: Yosry Ahmed Cc: "Zou, Nanhai" Cc: Barry Song <21cnbao@gmail.com> Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 8 ++++++-- include/linux/huge_mm.h | 1 + mm/huge_memory.c | 3 +++ mm/page_io.c | 1 + 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index a1bb495eab59a..e8db1543e0b97 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -530,10 +530,14 @@ anon_fault_fallback_charge instead falls back to using huge pages with lower orders or small pages even though the allocation was successful. -swpout - is incremented every time a huge page is swapped out in one +zswpout + is incremented every time a huge page is swapped out to zswap in one piece without splitting. +swpout + is incremented every time a huge page is swapped out to a non-zswap + swap device in one piece without splitting. + swpout_fallback is incremented if a huge page has to be split before swapout. 
Usually because failed to allocate some continuous swap space diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 006f730545c2a..c59e5aa9b081f 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -119,6 +119,7 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_ALLOC, MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, + MTHP_STAT_ZSWPOUT, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 61fc407330f23..b26c6503e9934 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -615,6 +615,7 @@ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); +DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -633,6 +634,7 @@ static struct attribute *anon_stats_attrs[] = { &anon_fault_fallback_attr.attr, &anon_fault_fallback_charge_attr.attr, #ifndef CONFIG_SHMEM + &zswpout_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -663,6 +665,7 @@ static struct attribute_group file_stats_attr_grp = { static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM + &zswpout_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif diff --git a/mm/page_io.c b/mm/page_io.c index 39e8be23bd95b..e9be4b436fd84 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -277,6 +277,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) swap_zeromap_folio_clear(folio); } if (zswap_store(folio)) { + count_mthp_stat(folio_order(folio), MTHP_STAT_ZSWPOUT); folio_unlock(folio); return 0; } From ed882add6ded66ece28eed8714aa18acdfb90b0c Mon Sep 17 00:00:00 2001 From: Kanchana P Sridhar Date: Wed, 2 Oct 2024 10:33:29 -0700 Subject: [PATCH 166/215] mm: zswap: zswap_store_page() will initialize entry after adding to xarray. This incorporates Yosry's suggestions in [1] for further simplifying zswap_store_page(). If the page is successfully compressed and added to the xarray, we get the pool/objcg refs, and initialize all the entry's members. Only after this, we add it to the zswap LRU. In the time between the entry's addition to the xarray and it's member initialization, we are protected against concurrent stores/loads/swapoff through the folio lock, and are protected against writeback because the entry is not on the LRU yet. This way, we don't have to drop the pool/objcg refs, now that the entry initialization is centralized to the successful page store code path. zswap_compress() is modified to take a zswap_pool parameter in keeping with this simplification (as against obtaining this from entry->pool). 
[1]: https://lore.kernel.org/all/CAJD7tkZh6ufHQef5HjXf_F5b5LC1EATexgseD=4WvrO+a6Ni6w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20241002173329.213722-1-kanchana.p.sridhar@intel.com Signed-off-by: Kanchana P Sridhar Cc: Chengming Zhou Cc: Huang Ying Cc: Johannes Weiner Cc: Nhat Pham Cc: Ryan Roberts Cc: Wajdi Feghali Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/zswap.c | 56 +++++++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index d6b1e81860b99..b68f80e1f8066 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -881,7 +881,8 @@ static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node) return 0; } -static bool zswap_compress(struct page *page, struct zswap_entry *entry) +static bool zswap_compress(struct page *page, struct zswap_entry *entry, + struct zswap_pool *pool) { struct crypto_acomp_ctx *acomp_ctx; struct scatterlist input, output; @@ -893,7 +894,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry) gfp_t gfp; u8 *dst; - acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx); + acomp_ctx = raw_cpu_ptr(pool->acomp_ctx); mutex_lock(&acomp_ctx->mutex); @@ -926,7 +927,7 @@ static bool zswap_compress(struct page *page, struct zswap_entry *entry) if (comp_ret) goto unlock; - zpool = entry->pool->zpool; + zpool = pool->zpool; gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM; if (zpool_malloc_support_movable(zpool)) gfp |= __GFP_HIGHMEM | __GFP_MOVABLE; @@ -1414,32 +1415,21 @@ static ssize_t zswap_store_page(struct page *page, struct obj_cgroup *objcg, struct zswap_pool *pool) { + swp_entry_t page_swpentry = page_swap_entry(page); struct zswap_entry *entry, *old; /* allocate entry */ entry = zswap_entry_cache_alloc(GFP_KERNEL, page_to_nid(page)); if (!entry) { zswap_reject_kmemcache_fail++; - goto reject; + return -EINVAL; } - /* zswap_store() already holds a ref on 'objcg' and 'pool' */ - if (objcg) - obj_cgroup_get(objcg); - zswap_pool_get(pool); - - /* if entry is successfully added, it keeps the reference */ - entry->pool = pool; + if (!zswap_compress(page, entry, pool)) + goto compress_failed; - if (!zswap_compress(page, entry)) - goto put_pool_objcg; - - entry->swpentry = page_swap_entry(page); - entry->objcg = objcg; - entry->referenced = true; - - old = xa_store(swap_zswap_tree(entry->swpentry), - swp_offset(entry->swpentry), + old = xa_store(swap_zswap_tree(page_swpentry), + swp_offset(page_swpentry), entry, GFP_KERNEL); if (xa_is_err(old)) { int err = xa_err(old); @@ -1457,6 +1447,16 @@ static ssize_t zswap_store_page(struct page *page, if (old) zswap_entry_free(old); + /* + * The entry is successfully compressed and stored in the tree, there is + * no further possibility of failure. Grab refs to the pool and objcg. + * These refs will be dropped by zswap_entry_free() when the entry is + * removed from the tree. + */ + zswap_pool_get(pool); + if (objcg) + obj_cgroup_get(objcg); + /* * We finish initializing the entry while it's already in xarray. * This is safe because: @@ -1467,25 +1467,21 @@ static ssize_t zswap_store_page(struct page *page, * The publishing order matters to prevent writeback from seeing * an incoherent entry. */ + entry->pool = pool; + entry->swpentry = page_swpentry; + entry->objcg = objcg; + entry->referenced = true; if (entry->length) { INIT_LIST_HEAD(&entry->lru); zswap_lru_add(&zswap_list_lru, entry); } - /* - * We shouldn't have any possibility of failure after the entry is - * added in the xarray. 
The pool/objcg refs obtained here will only - * be dropped if/when zswap_entry_free() gets called. - */ return entry->length; store_failed: - zpool_free(entry->pool->zpool, entry->handle); -put_pool_objcg: - zswap_pool_put(pool); - obj_cgroup_put(objcg); + zpool_free(pool->zpool, entry->handle); +compress_failed: zswap_entry_cache_free(entry); -reject: return -EINVAL; } From aaf2914aec0fa67395574f6fa6b726168b049e60 Mon Sep 17 00:00:00 2001 From: Barry Song Date: Sat, 26 Oct 2024 21:24:23 +1300 Subject: [PATCH 167/215] mm: add per-order mTHP swpin counters This helps profile the sizes of folios being swapped in. Currently, only mTHP swap-out is being counted. The new interface can be found at: /sys/kernel/mm/transparent_hugepage/hugepages-/stats swpin For example, cat /sys/kernel/mm/transparent_hugepage/hugepages-64kB/stats/swpin 12809 cat /sys/kernel/mm/transparent_hugepage/hugepages-32kB/stats/swpin 4763 [v-songbaohua@oppo.com: add a blank line in doc] Link: https://lkml.kernel.org/r/20241030233423.80759-1-21cnbao@gmail.com Link: https://lkml.kernel.org/r/20241026082423.26298-1-21cnbao@gmail.com Signed-off-by: Barry Song Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Chris Li Cc: Yosry Ahmed Cc: "Huang, Ying" Cc: Kairui Song Cc: Ryan Roberts Cc: Kanchana P Sridhar Cc: Usama Arif Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 4 ++++ include/linux/huge_mm.h | 1 + mm/huge_memory.c | 3 +++ mm/page_io.c | 3 +++ 4 files changed, 11 insertions(+) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index e8db1543e0b97..abdf10a1c7db5 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -534,6 +534,10 @@ zswpout is incremented every time a huge page is swapped out to zswap in one piece without splitting. +swpin + is incremented every time a huge page is swapped in from a non-zswap + swap device in one piece. + swpout is incremented every time a huge page is swapped out to a non-zswap swap device in one piece without splitting. 
diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c59e5aa9b081f..b94c2e8ee9188 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -120,6 +120,7 @@ enum mthp_stat_item { MTHP_STAT_ANON_FAULT_FALLBACK, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE, MTHP_STAT_ZSWPOUT, + MTHP_STAT_SWPIN, MTHP_STAT_SWPOUT, MTHP_STAT_SWPOUT_FALLBACK, MTHP_STAT_SHMEM_ALLOC, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b26c6503e9934..f920688644692 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -616,6 +616,7 @@ DEFINE_MTHP_STAT_ATTR(anon_fault_alloc, MTHP_STAT_ANON_FAULT_ALLOC); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback, MTHP_STAT_ANON_FAULT_FALLBACK); DEFINE_MTHP_STAT_ATTR(anon_fault_fallback_charge, MTHP_STAT_ANON_FAULT_FALLBACK_CHARGE); DEFINE_MTHP_STAT_ATTR(zswpout, MTHP_STAT_ZSWPOUT); +DEFINE_MTHP_STAT_ATTR(swpin, MTHP_STAT_SWPIN); DEFINE_MTHP_STAT_ATTR(swpout, MTHP_STAT_SWPOUT); DEFINE_MTHP_STAT_ATTR(swpout_fallback, MTHP_STAT_SWPOUT_FALLBACK); #ifdef CONFIG_SHMEM @@ -635,6 +636,7 @@ static struct attribute *anon_stats_attrs[] = { &anon_fault_fallback_charge_attr.attr, #ifndef CONFIG_SHMEM &zswpout_attr.attr, + &swpin_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif @@ -666,6 +668,7 @@ static struct attribute_group file_stats_attr_grp = { static struct attribute *any_stats_attrs[] = { #ifdef CONFIG_SHMEM &zswpout_attr.attr, + &swpin_attr.attr, &swpout_attr.attr, &swpout_fallback_attr.attr, #endif diff --git a/mm/page_io.c b/mm/page_io.c index e9be4b436fd84..4b4ea8e49cf69 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -495,6 +495,7 @@ static void sio_read_complete(struct kiocb *iocb, long ret) for (p = 0; p < sio->pages; p++) { struct folio *folio = page_folio(sio->bvec[p].bv_page); + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); folio_mark_uptodate(folio); folio_unlock(folio); @@ -589,6 +590,7 @@ static void swap_read_folio_bdev_sync(struct folio *folio, * attempt to access it in the page fault retry time check. */ get_task_struct(current); + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio_wait(&bio); @@ -605,6 +607,7 @@ static void swap_read_folio_bdev_async(struct folio *folio, bio->bi_iter.bi_sector = swap_folio_sector(folio); bio->bi_end_io = end_swap_bio_read; bio_add_folio_nofail(bio, folio, folio_size(folio), 0); + count_mthp_stat(folio_order(folio), MTHP_STAT_SWPIN); count_memcg_folio_events(folio, PSWPIN, folio_nr_pages(folio)); count_vm_events(PSWPIN, folio_nr_pages(folio)); submit_bio(bio); From ae193dd79398970ee760e0c8129ac42ef8f5c6ff Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Wed, 16 Oct 2024 18:18:00 +0500 Subject: [PATCH 168/215] kasan: move checks to do_strncpy_from_user Patch series "kasan: migrate the last module test to kunit", v4. copy_user_test() is the last KUnit-incompatible test with CONFIG_KASAN_MODULE_TEST requirement, which we are going to migrate to KUnit framework and delete the former test and Kconfig as well. In this patch series: - [1/3] move kasan_check_write() and check_object_size() to do_strncpy_from_user() to cover with KASAN checks with multiple conditions in strncpy_from_user(). - [2/3] migrated copy_user_test() to KUnit, where we can also test strncpy_from_user() due to [1/4]. KUnits have been tested on: - x86_64 with CONFIG_KASAN_GENERIC. Passed - arm64 with CONFIG_KASAN_SW_TAGS. 1 fail. 
See [1] - arm64 with CONFIG_KASAN_HW_TAGS. 1 fail. See [1] [1] https://lore.kernel.org/linux-mm/CACzwLxj21h7nCcS2-KA_q7ybe+5pxH0uCDwu64q_9pPsydneWQ@mail.gmail.com/ - [3/3] delete CONFIG_KASAN_MODULE_TEST and documentation occurrences. This patch (of 3): Since in the commit 2865baf54077("x86: support user address masking instead of non-speculative conditional") do_strncpy_from_user() is called from multiple places, we should sanitize the kernel *dst memory and size which were done in strncpy_from_user() previously. Link: https://lkml.kernel.org/r/20241016131802.3115788-1-snovitoll@gmail.com Link: https://lkml.kernel.org/r/20241016131802.3115788-2-snovitoll@gmail.com Fixes: 2865baf54077 ("x86: support user address masking instead of non-speculative conditional") Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Alex Shi Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Marco Elver Cc: Vincenzo Frascino Cc: Yanteng Si Signed-off-by: Andrew Morton --- lib/strncpy_from_user.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 989a12a678721..6dc234913dd58 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -120,6 +120,9 @@ long strncpy_from_user(char *dst, const char __user *src, long count) if (unlikely(count <= 0)) return 0; + kasan_check_write(dst, count); + check_object_size(dst, count, false); + if (can_do_masked_user_access()) { long retval; @@ -142,8 +145,6 @@ long strncpy_from_user(char *dst, const char __user *src, long count) if (max > count) max = count; - kasan_check_write(dst, count); - check_object_size(dst, count, false); if (user_read_access_begin(src, max)) { retval = do_strncpy_from_user(dst, src, count, max); user_read_access_end(); From ca79a00bb9a899674a63018c6cd155a3730c3509 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Wed, 16 Oct 2024 18:18:01 +0500 Subject: [PATCH 169/215] kasan: migrate copy_user_test to kunit Migrate the copy_user_test to the KUnit framework to verify out-of-bound detection via KASAN reports in copy_from_user(), copy_to_user() and their static functions. This is the last migrated test in kasan_test_module.c, therefore delete the file. 
[arnd@arndb.de: export copy_to_kernel_nofault] Link: https://lkml.kernel.org/r/20241018151112.3533820-1-arnd@kernel.org Link: https://lkml.kernel.org/r/20241016131802.3115788-3-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Signed-off-by: Arnd Bergmann Reviewed-by: Andrey Konovalov Acked-by: David Hildenbrand Cc: Alexander Potapenko Cc: Alex Shi Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Marco Elver Cc: Vincenzo Frascino Cc: Yanteng Si Signed-off-by: Andrew Morton --- mm/kasan/Makefile | 2 - mm/kasan/kasan_test_c.c | 47 +++++++++++++++++++++ mm/kasan/kasan_test_module.c | 81 ------------------------------------ mm/maccess.c | 1 + 4 files changed, 48 insertions(+), 83 deletions(-) delete mode 100644 mm/kasan/kasan_test_module.c diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index b88543e5c0cce..1a958e7c8a464 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -46,7 +46,6 @@ endif CFLAGS_kasan_test_c.o := $(CFLAGS_KASAN_TEST) RUSTFLAGS_kasan_test_rust.o := $(RUSTFLAGS_KASAN) -CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST) obj-y := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o @@ -59,4 +58,3 @@ ifdef CONFIG_RUST endif obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o -obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index fe132ce3c2b34..fd5058c5d0f75 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -1963,6 +1963,52 @@ static void copy_to_kernel_nofault_oob(struct kunit *test) kfree(ptr); } +static void copy_user_test_oob(struct kunit *test) +{ + char *kmem; + char __user *usermem; + unsigned long useraddr; + size_t size = 128 - KASAN_GRANULE_SIZE; + int __maybe_unused unused; + + kmem = kunit_kmalloc(test, size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, kmem); + + useraddr = kunit_vm_mmap(test, NULL, 0, PAGE_SIZE, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_ANONYMOUS | MAP_PRIVATE, 0); + KUNIT_ASSERT_NE_MSG(test, useraddr, 0, + "Could not create userspace mm"); + KUNIT_ASSERT_LT_MSG(test, useraddr, (unsigned long)TASK_SIZE, + "Failed to allocate user memory"); + + OPTIMIZER_HIDE_VAR(size); + usermem = (char __user *)useraddr; + + KUNIT_EXPECT_KASAN_FAIL(test, + unused = copy_from_user(kmem, usermem, size + 1)); + KUNIT_EXPECT_KASAN_FAIL(test, + unused = copy_to_user(usermem, kmem, size + 1)); + KUNIT_EXPECT_KASAN_FAIL(test, + unused = __copy_from_user(kmem, usermem, size + 1)); + KUNIT_EXPECT_KASAN_FAIL(test, + unused = __copy_to_user(usermem, kmem, size + 1)); + KUNIT_EXPECT_KASAN_FAIL(test, + unused = __copy_from_user_inatomic(kmem, usermem, size + 1)); + KUNIT_EXPECT_KASAN_FAIL(test, + unused = __copy_to_user_inatomic(usermem, kmem, size + 1)); + + /* + * Prepare a long string in usermem to avoid the strncpy_from_user test + * bailing out on '\0' before it reaches out-of-bounds. 
+ */ + memset(kmem, 'a', size); + KUNIT_EXPECT_EQ(test, copy_to_user(usermem, kmem, size), 0); + + KUNIT_EXPECT_KASAN_FAIL(test, + unused = strncpy_from_user(kmem, usermem, size + 1)); +} + static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_oob_right), KUNIT_CASE(kmalloc_oob_left), @@ -2037,6 +2083,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(match_all_mem_tag), KUNIT_CASE(copy_to_kernel_nofault_oob), KUNIT_CASE(rust_uaf), + KUNIT_CASE(copy_user_test_oob), {} }; diff --git a/mm/kasan/kasan_test_module.c b/mm/kasan/kasan_test_module.c deleted file mode 100644 index 27ec22767e422..0000000000000 --- a/mm/kasan/kasan_test_module.c +++ /dev/null @@ -1,81 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * - * Copyright (c) 2014 Samsung Electronics Co., Ltd. - * Author: Andrey Ryabinin - */ - -#define pr_fmt(fmt) "kasan: test: " fmt - -#include -#include -#include -#include -#include - -#include "kasan.h" - -static noinline void __init copy_user_test(void) -{ - char *kmem; - char __user *usermem; - size_t size = 128 - KASAN_GRANULE_SIZE; - int __maybe_unused unused; - - kmem = kmalloc(size, GFP_KERNEL); - if (!kmem) - return; - - usermem = (char __user *)vm_mmap(NULL, 0, PAGE_SIZE, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_ANONYMOUS | MAP_PRIVATE, 0); - if (IS_ERR(usermem)) { - pr_err("Failed to allocate user memory\n"); - kfree(kmem); - return; - } - - OPTIMIZER_HIDE_VAR(size); - - pr_info("out-of-bounds in copy_from_user()\n"); - unused = copy_from_user(kmem, usermem, size + 1); - - pr_info("out-of-bounds in copy_to_user()\n"); - unused = copy_to_user(usermem, kmem, size + 1); - - pr_info("out-of-bounds in __copy_from_user()\n"); - unused = __copy_from_user(kmem, usermem, size + 1); - - pr_info("out-of-bounds in __copy_to_user()\n"); - unused = __copy_to_user(usermem, kmem, size + 1); - - pr_info("out-of-bounds in __copy_from_user_inatomic()\n"); - unused = __copy_from_user_inatomic(kmem, usermem, size + 1); - - pr_info("out-of-bounds in __copy_to_user_inatomic()\n"); - unused = __copy_to_user_inatomic(usermem, kmem, size + 1); - - pr_info("out-of-bounds in strncpy_from_user()\n"); - unused = strncpy_from_user(kmem, usermem, size + 1); - - vm_munmap((unsigned long)usermem, PAGE_SIZE); - kfree(kmem); -} - -static int __init kasan_test_module_init(void) -{ - /* - * Temporarily enable multi-shot mode. Otherwise, KASAN would only - * report the first detected bug and panic the kernel if panic_on_warn - * is enabled. - */ - bool multishot = kasan_save_enable_multi_shot(); - - copy_user_test(); - - kasan_restore_multi_shot(multishot); - return -EAGAIN; -} - -module_init(kasan_test_module_init); -MODULE_LICENSE("GPL"); diff --git a/mm/maccess.c b/mm/maccess.c index 3ca55ec63a6aa..8f0906180a944 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -82,6 +82,7 @@ long copy_to_kernel_nofault(void *dst, const void *src, size_t size) pagefault_enable(); return -EFAULT; } +EXPORT_SYMBOL_GPL(copy_to_kernel_nofault); long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count) { From 4e4d9c72c946b77f0278988d0bf1207fa1b2cd0f Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Wed, 16 Oct 2024 18:18:02 +0500 Subject: [PATCH 170/215] kasan: delete CONFIG_KASAN_MODULE_TEST Since we've migrated all tests to the KUnit framework, we can delete CONFIG_KASAN_MODULE_TEST and mentioning of it in the documentation as well. I've used the online translator to modify the non-English documentation. 
[snovitoll@gmail.com: fix indentation in translation] Link: https://lkml.kernel.org/r/20241020042813.3223449-1-snovitoll@gmail.com Link: https://lkml.kernel.org/r/20241016131802.3115788-4-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Alex Shi Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Hu Haowen <2023002089@link.tyut.edu.cn> Cc: Jonathan Corbet Cc: Marco Elver Cc: Vincenzo Frascino Cc: Yanteng Si Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 23 ++++++++----------- .../translations/zh_CN/dev-tools/kasan.rst | 20 +++++++--------- .../translations/zh_TW/dev-tools/kasan.rst | 21 ++++++++--------- lib/Kconfig.kasan | 7 ------ mm/kasan/kasan.h | 2 +- mm/kasan/report.c | 2 +- 6 files changed, 28 insertions(+), 47 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index d7de44f5339d4..0a1418ab72fdf 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -511,19 +511,14 @@ Tests ~~~~~ There are KASAN tests that allow verifying that KASAN works and can detect -certain types of memory corruptions. The tests consist of two parts: +certain types of memory corruptions. -1. Tests that are integrated with the KUnit Test Framework. Enabled with -``CONFIG_KASAN_KUNIT_TEST``. These tests can be run and partially verified +All KASAN tests are integrated with the KUnit Test Framework and can be enabled +via ``CONFIG_KASAN_KUNIT_TEST``. The tests can be run and partially verified automatically in a few different ways; see the instructions below. -2. Tests that are currently incompatible with KUnit. Enabled with -``CONFIG_KASAN_MODULE_TEST`` and can only be run as a module. These tests can -only be verified manually by loading the kernel module and inspecting the -kernel log for KASAN reports. - -Each KUnit-compatible KASAN test prints one of multiple KASAN reports if an -error is detected. Then the test prints its number and status. +Each KASAN test prints one of multiple KASAN reports if an error is detected. +Then the test prints its number and status. When a test passes:: @@ -550,16 +545,16 @@ Or, if one of the tests failed:: not ok 1 - kasan -There are a few ways to run KUnit-compatible KASAN tests. +There are a few ways to run the KASAN tests. 1. Loadable module - With ``CONFIG_KUNIT`` enabled, KASAN-KUnit tests can be built as a loadable - module and run by loading ``kasan_test.ko`` with ``insmod`` or ``modprobe``. + With ``CONFIG_KUNIT`` enabled, the tests can be built as a loadable module + and run by loading ``kasan_test.ko`` with ``insmod`` or ``modprobe``. 2. Built-In - With ``CONFIG_KUNIT`` built-in, KASAN-KUnit tests can be built-in as well. + With ``CONFIG_KUNIT`` built-in, the tests can be built-in as well. In this case, the tests will run at boot as a late-init call. 3. Using kunit_tool diff --git a/Documentation/translations/zh_CN/dev-tools/kasan.rst b/Documentation/translations/zh_CN/dev-tools/kasan.rst index 4491ad2830ed9..fd2e3afbdfad0 100644 --- a/Documentation/translations/zh_CN/dev-tools/kasan.rst +++ b/Documentation/translations/zh_CN/dev-tools/kasan.rst @@ -422,16 +422,12 @@ KASAN连接到vmap基础架构以懒清理未使用的影子内存。 ~~~~ 有一些KASAN测试可以验证KASAN是否正常工作并可以检测某些类型的内存损坏。 -测试由两部分组成: -1. 与KUnit测试框架集成的测试。使用 ``CONFIG_KASAN_KUNIT_TEST`` 启用。 -这些测试可以通过几种不同的方式自动运行和部分验证;请参阅下面的说明。 +所有 KASAN 测试都与 KUnit 测试框架集成,可通过 ``CONFIG_KASAN_KUNIT_TEST`` 启用。 +测试可以通过几种不同的方式自动运行和部分验证;请参阅以下说明。 -2. 
与KUnit不兼容的测试。使用 ``CONFIG_KASAN_MODULE_TEST`` 启用并且只能作为模块 -运行。这些测试只能通过加载内核模块并检查内核日志以获取KASAN报告来手动验证。 - -如果检测到错误,每个KUnit兼容的KASAN测试都会打印多个KASAN报告之一,然后测试打印 -其编号和状态。 +如果检测到错误,每个 KASAN 测试都会打印多份 KASAN 报告中的一份。 +然后测试会打印其编号和状态。 当测试通过:: @@ -458,16 +454,16 @@ KASAN连接到vmap基础架构以懒清理未使用的影子内存。 not ok 1 - kasan -有几种方法可以运行与KUnit兼容的KASAN测试。 +有几种方法可以运行 KASAN 测试。 1. 可加载模块 - 启用 ``CONFIG_KUNIT`` 后,KASAN-KUnit测试可以构建为可加载模块,并通过使用 - ``insmod`` 或 ``modprobe`` 加载 ``kasan_test.ko`` 来运行。 + 启用 ``CONFIG_KUNIT`` 后,可以将测试构建为可加载模块 + 并通过使用 ``insmod`` 或 ``modprobe`` 加载 ``kasan_test.ko`` 来运行。 2. 内置 - 通过内置 ``CONFIG_KUNIT`` ,也可以内置KASAN-KUnit测试。在这种情况下, + 通过内置 ``CONFIG_KUNIT``,测试也可以内置。 测试将在启动时作为后期初始化调用运行。 3. 使用kunit_tool diff --git a/Documentation/translations/zh_TW/dev-tools/kasan.rst b/Documentation/translations/zh_TW/dev-tools/kasan.rst index ed342e67d8ed0..27fb7645174d0 100644 --- a/Documentation/translations/zh_TW/dev-tools/kasan.rst +++ b/Documentation/translations/zh_TW/dev-tools/kasan.rst @@ -404,16 +404,13 @@ KASAN連接到vmap基礎架構以懶清理未使用的影子內存。 ~~~~ 有一些KASAN測試可以驗證KASAN是否正常工作並可以檢測某些類型的內存損壞。 -測試由兩部分組成: -1. 與KUnit測試框架集成的測試。使用 ``CONFIG_KASAN_KUNIT_TEST`` 啓用。 -這些測試可以通過幾種不同的方式自動運行和部分驗證;請參閱下面的說明。 +所有 KASAN 測試均與 KUnit 測試框架集成,並且可以啟用 +透過 ``CONFIG_KASAN_KUNIT_TEST``。可以運行測試並進行部分驗證 +以幾種不同的方式自動進行;請參閱下面的說明。 -2. 與KUnit不兼容的測試。使用 ``CONFIG_KASAN_MODULE_TEST`` 啓用並且只能作爲模塊 -運行。這些測試只能通過加載內核模塊並檢查內核日誌以獲取KASAN報告來手動驗證。 - -如果檢測到錯誤,每個KUnit兼容的KASAN測試都會打印多個KASAN報告之一,然後測試打印 -其編號和狀態。 +如果偵測到錯誤,每個 KASAN 測試都會列印多個 KASAN 報告之一。 +然後測試列印其編號和狀態。 當測試通過:: @@ -440,16 +437,16 @@ KASAN連接到vmap基礎架構以懶清理未使用的影子內存。 not ok 1 - kasan -有幾種方法可以運行與KUnit兼容的KASAN測試。 +有幾種方法可以執行 KASAN 測試。 1. 可加載模塊 - 啓用 ``CONFIG_KUNIT`` 後,KASAN-KUnit測試可以構建爲可加載模塊,並通過使用 - ``insmod`` 或 ``modprobe`` 加載 ``kasan_test.ko`` 來運行。 + 啟用 ``CONFIG_KUNIT`` 後,測試可以建置為可載入模組 + 並且透過使用 ``insmod`` 或 ``modprobe`` 來載入 ``kasan_test.ko`` 來運作。 2. 內置 - 通過內置 ``CONFIG_KUNIT`` ,也可以內置KASAN-KUnit測試。在這種情況下, + 透過內建 ``CONFIG_KUNIT``,測試也可以內建。 測試將在啓動時作爲後期初始化調用運行。 3. 使用kunit_tool diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 98016e137b7f0..f82889a830fae 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -195,13 +195,6 @@ config KASAN_KUNIT_TEST For more information on KUnit and unit tests in general, please refer to the KUnit documentation in Documentation/dev-tools/kunit/. -config KASAN_MODULE_TEST - tristate "KUnit-incompatible tests of KASAN bug detection capabilities" - depends on m && KASAN && !KASAN_HW_TAGS - help - A part of the KASAN test suite that is not integrated with KUnit. - Incompatible with Hardware Tag-Based KASAN. 
- config KASAN_EXTRA_INFO bool "Record and report more information" depends on KASAN diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index f438a6cdc964f..b7e4b81421b3f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -568,7 +568,7 @@ static inline void kasan_kunit_test_suite_end(void) { } #endif /* CONFIG_KASAN_KUNIT_TEST */ -#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) bool kasan_save_enable_multi_shot(void); void kasan_restore_multi_shot(bool enabled); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index b48c768acc84d..3e48668c3e40a 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -132,7 +132,7 @@ static bool report_enabled(void) return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags); } -#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) || IS_ENABLED(CONFIG_KASAN_MODULE_TEST) +#if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) bool kasan_save_enable_multi_shot(void) { From 5f6170a469cd2c13ad4dffe42714cf777b132451 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Oct 2024 14:13:27 +0000 Subject: [PATCH 171/215] mm: pagewalk: add the ability to install PTEs Patch series "implement lightweight guard pages", v4. Userland library functions such as allocators and threading implementations often require regions of memory to act as 'guard pages' - mappings which, when accessed, result in a fatal signal being sent to the accessing process. The current means by which these are implemented is via a PROT_NONE mmap() mapping, which provides the required semantics however incur an overhead of a VMA for each such region. With a great many processes and threads, this can rapidly add up and incur a significant memory penalty. It also has the added problem of preventing merges that might otherwise be permitted. This series takes a different approach - an idea suggested by Vlastimil Babka (and before him David Hildenbrand and Jann Horn - perhaps more - the provenance becomes a little tricky to ascertain after this - please forgive any omissions!) - rather than locating the guard pages at the VMA layer, instead placing them in page tables mapping the required ranges. Early testing of the prototype version of this code suggests a 5 times speed up in memory mapping invocations (in conjunction with use of process_madvise()) and a 13% reduction in VMAs on an entirely idle android system and unoptimised code. We expect with optimisation and a loaded system with a larger number of guard pages this could significantly increase, but in any case these numbers are encouraging. This way, rather than having separate VMAs specifying which parts of a range are guard pages, instead we have a VMA spanning the entire range of memory a user is permitted to access and including ranges which are to be 'guarded'. After mapping this, a user can specify which parts of the range should result in a fatal signal when accessed. By restricting the ability to specify guard pages to memory mapped by existing VMAs, we can rely on the mappings being torn down when the mappings are ultimately unmapped and everything works simply as if the memory were not faulted in, from the point of view of the containing VMAs. This mechanism in effect poisons memory ranges similar to hardware memory poisoning, only it is an entirely software-controlled form of poisoning. The mechanism is implemented via madvise() behaviour - MADV_GUARD_INSTALL which installs page table-level guard page markers - and MADV_GUARD_REMOVE - which clears them. 
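As a rough illustration of the interface described above, userland code
might use it along the following lines. This is only a sketch, assuming
uapi headers from a kernel with this series applied (which provide the
MADV_GUARD_INSTALL/MADV_GUARD_REMOVE constants); error handling is
abbreviated:

  #include <stdio.h>
  #include <stdlib.h>
  #include <sys/mman.h>
  #include <unistd.h>

  int main(void)
  {
          size_t pgsz = (size_t)sysconf(_SC_PAGESIZE);
          size_t len = 16 * pgsz;

          /* A single VMA spanning the whole range the code may use. */
          char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
          if (p == MAP_FAILED)
                  return EXIT_FAILURE;

          /*
           * Mark the first page as a guard region: accessing it delivers a
           * fatal signal, but no extra VMA is created for it.
           */
          if (madvise(p, pgsz, MADV_GUARD_INSTALL))
                  perror("MADV_GUARD_INSTALL");

          /* ... p + pgsz up to p + len can be used as ordinary memory ... */

          /* Clear the marker; the page reverts to normal, unfaulted memory. */
          if (madvise(p, pgsz, MADV_GUARD_REMOVE))
                  perror("MADV_GUARD_REMOVE");

          munmap(p, len);
          return EXIT_SUCCESS;
  }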
Guard markers can be installed across multiple VMAs and any existing
mappings will be cleared, that is zapped, before installing the guard page
markers in the page tables.

There is no concept of 'nested' guard markers; multiple attempts to install
guard markers in a range will, after the first attempt, have no effect.

Importantly, removing guard markers over a range that contains both guard
markers and ordinary backed memory has no effect on anything but the guard
markers (including leaving huge pages un-split), so a user can safely
remove guard markers over a range of memory leaving the rest intact.

The actual mechanism by which the page table entries are specified makes
use of existing logic - PTE markers, which are used for the userfaultfd
UFFDIO_POISON mechanism.

Unfortunately PTE_MARKER_POISONED is not suited for the guard page
mechanism as it results in VM_FAULT_HWPOISON semantics in the fault
handler, so we add our own specific PTE_MARKER_GUARD and adapt existing
logic to handle it.

We also extend the generic page walk mechanism to allow for installation of
PTEs (carefully restricted to memory management logic only to prevent
unwanted abuse).

We ensure that zapping performed by MADV_DONTNEED and MADV_FREE does not
remove guard markers, nor does forking (except when VM_WIPEONFORK is
specified for a VMA, which implies a total removal of memory
characteristics).

It's important to note that the guard page implementation is emphatically
NOT a security feature, so a user can remove the markers if they wish. We
simply implement it in such a way as to provide the least surprising
behaviour.

An extensive set of self-tests is provided which ensures behaviour is as
expected and additionally self-documents the expected behaviour of guard
ranges.

This patch (of 5):

The existing generic pagewalk logic permits the walking of page tables,
invoking callbacks at individual page table levels via user-provided
mm_walk_ops callbacks.

This is useful for traversing existing page table entries, but precludes
the ability to establish new ones.

Existing mechanisms for performing a walk which also installs page table
entries if necessary are heavily duplicated throughout the kernel, each
with semantic differences from one another and largely unavailable for use
elsewhere.

Rather than add yet another implementation, we extend the generic pagewalk
logic to enable the installation of page table entries by adding a new
install_pte() callback in mm_walk_ops. If this is specified, then upon
encountering a missing page table entry, we allocate and install a new one
and continue the traversal.

If a THP huge page is encountered at either the PMD or PUD level, we split
it only if there is an ops->pte_entry() (or an ops->pmd_entry() at the PUD
level); otherwise, if there is only an ops->install_pte(), we avoid the
unnecessary split.

We do not support hugetlb at this stage.

If this function returns an error, or an allocation fails during the
operation, we abort the operation altogether. It is up to the caller to
deal appropriately with partially populated page table ranges.

If install_pte() is defined, the semantics of pte_entry() change - this
callback is then only invoked if the entry already exists. This is a
useful property, as it allows a caller to handle existing PTEs while
installing new ones where necessary in the specified range.

If install_pte() is not defined, then this patch makes no functional
difference, so all existing logic will work precisely as it did before.
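As a rough sketch of how an internal user could wire up the new callback:
this is not code from this series, only an illustration based on the
install_pte() signature and the walk_page_range_mm() helper added in this
patch, and PTE_MARKER_GUARD is introduced by a later patch in the series:

  /* Hypothetical internal mm user of the new install_pte() callback. */
  static int guard_install_pte(unsigned long addr, unsigned long next,
                               pte_t *ptep, struct mm_walk *walk)
  {
          /* The walker performs set_pte_at() itself; we only supply the value. */
          *ptep = make_pte_marker(PTE_MARKER_GUARD);
          return 0;
  }

  static const struct mm_walk_ops guard_install_walk_ops = {
          .install_pte    = guard_install_pte,
          .walk_lock      = PGWALK_WRLOCK,
  };

  static int install_guard_markers(struct vm_area_struct *vma,
                                   unsigned long start, unsigned long end)
  {
          /*
           * walk_page_range_mm() is exposed only via mm/internal.h, so only
           * core mm code can install PTEs this way.
           */
          return walk_page_range_mm(vma->vm_mm, start, end,
                                    &guard_install_walk_ops, NULL);
  }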
As we only permit the installation of PTEs where a mapping does not already exist there is no need for TLB management, however we do invoke update_mmu_cache() for architectures which require manual maintenance of mappings for other CPUs. We explicitly do not allow the existing page walk API to expose this feature as it is dangerous and intended for internal mm use only. Therefore we provide a new walk_page_range_mm() function exposed only to mm/internal.h. We take the opportunity to additionally clean up the page walker logic to be a little easier to follow. Link: https://lkml.kernel.org/r/cover.1730123433.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/51b432ebef013e3fdf9f92101533435de1bffadf.1730123433.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Jann Horn Reviewed-by: Vlastimil Babka Suggested-by: Vlastimil Babka Suggested-by: Jann Horn Suggested-by: David Hildenbrand Cc: Arnd Bergmann Cc: Christian Brauner Cc: Christoph Hellwig Cc: Chris Zankel Cc: Helge Deller Cc: James E.J. Bottomley Cc: Jeff Xu Cc: John Hubbard Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Matt Turner Cc: Max Filippov Cc: Muchun Song Cc: Paul E. McKenney Cc: Richard Henderson Cc: Shuah Khan Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Shuah Khan Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 18 ++- mm/internal.h | 6 + mm/pagewalk.c | 246 ++++++++++++++++++++++++++++----------- 3 files changed, 201 insertions(+), 69 deletions(-) diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index f5eb5a32aeed9..9700a29f8afbc 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -25,12 +25,15 @@ enum page_walk_lock { * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each PTE (lowest-level) entry, - * including empty ones + * @pte_entry: if set, called for each PTE (lowest-level) entry + * including empty ones, except if @install_pte is set. + * If @install_pte is set, @pte_entry is called only for + * existing PTEs. * @pte_hole: if set, called for each hole at all levels, * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. * Any folded depths (where PTRS_PER_P?D is equal to 1) - * are skipped. + * are skipped. If @install_pte is specified, this will + * not trigger for any populated ranges. * @hugetlb_entry: if set, called for each hugetlb entry. This hook * function is called with the vma lock held, in order to * protect against a concurrent freeing of the pte_t* or @@ -51,6 +54,13 @@ enum page_walk_lock { * @pre_vma: if set, called before starting walk on a non-null vma. * @post_vma: if set, called after a walk on a non-null vma, provided * that @pre_vma and the vma walk succeeded. + * @install_pte: if set, missing page table entries are installed and + * thus all levels are always walked in the specified + * range. This callback is then invoked at the PTE level + * (having split any THP pages prior), providing the PTE to + * install. If allocations fail, the walk is aborted. This + * operation is only available for userland memory. Not + * usable for hugetlb ranges. * * p?d_entry callbacks are called even if those levels are folded on a * particular architecture/configuration. 
@@ -76,6 +86,8 @@ struct mm_walk_ops { int (*pre_vma)(unsigned long start, unsigned long end, struct mm_walk *walk); void (*post_vma)(struct mm_walk *walk); + int (*install_pte)(unsigned long addr, unsigned long next, + pte_t *ptep, struct mm_walk *walk); enum page_walk_lock walk_lock; }; diff --git a/mm/internal.h b/mm/internal.h index fd6373cb1c66d..d5b93c5b63648 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -1502,4 +1503,9 @@ static inline void accept_page(struct page *page) } #endif /* CONFIG_UNACCEPTED_MEMORY */ +/* pagewalk.c */ +int walk_page_range_mm(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private); + #endif /* __MM_INTERNAL_H */ diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 5f9f01532e67a..e478777c86e19 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -3,9 +3,14 @@ #include #include #include +#include #include #include +#include + +#include "internal.h" + /* * We want to know the real level where a entry is located ignoring any * folding of levels which may be happening. For example if p4d is folded then @@ -29,9 +34,23 @@ static int walk_pte_range_inner(pte_t *pte, unsigned long addr, int err = 0; for (;;) { - err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); - if (err) - break; + if (ops->install_pte && pte_none(ptep_get(pte))) { + pte_t new_pte; + + err = ops->install_pte(addr, addr + PAGE_SIZE, &new_pte, + walk); + if (err) + break; + + set_pte_at(walk->mm, addr, pte, new_pte); + /* Non-present before, so for arches that need it. */ + if (!WARN_ON_ONCE(walk->no_vma)) + update_mmu_cache(walk->vma, addr, pte); + } else { + err = ops->pte_entry(pte, addr, addr + PAGE_SIZE, walk); + if (err) + break; + } if (addr >= end - PAGE_SIZE) break; addr += PAGE_SIZE; @@ -81,6 +100,8 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pmd_t *pmd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; + bool has_handler = ops->pte_entry; + bool has_install = ops->install_pte; int err = 0; int depth = real_depth(3); @@ -89,11 +110,14 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, again: next = pmd_addr_end(addr, end); if (pmd_none(*pmd)) { - if (ops->pte_hole) + if (has_install) + err = __pte_alloc(walk->mm, pmd); + else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; - continue; + if (!has_install) + continue; } walk->action = ACTION_SUBTREE; @@ -109,18 +133,25 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, if (walk->action == ACTION_AGAIN) goto again; - - /* - * Check this here so we only break down trans_huge - * pages when we _need_ to - */ - if ((!walk->vma && (pmd_leaf(*pmd) || !pmd_present(*pmd))) || - walk->action == ACTION_CONTINUE || - !(ops->pte_entry)) + if (walk->action == ACTION_CONTINUE) continue; + if (!has_handler) { /* No handlers for lower page tables. */ + if (!has_install) + continue; /* Nothing to do. */ + /* + * We are ONLY installing, so avoid unnecessarily + * splitting a present huge page. + */ + if (pmd_present(*pmd) && + (pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) + continue; + } + if (walk->vma) split_huge_pmd(walk->vma, pmd, addr); + else if (pmd_leaf(*pmd) || !pmd_present(*pmd)) + continue; /* Nothing to do. 
*/ err = walk_pte_range(pmd, addr, next, walk); if (err) @@ -140,6 +171,8 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, pud_t *pud; unsigned long next; const struct mm_walk_ops *ops = walk->ops; + bool has_handler = ops->pmd_entry || ops->pte_entry; + bool has_install = ops->install_pte; int err = 0; int depth = real_depth(2); @@ -148,11 +181,14 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, again: next = pud_addr_end(addr, end); if (pud_none(*pud)) { - if (ops->pte_hole) + if (has_install) + err = __pmd_alloc(walk->mm, pud, addr); + else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; - continue; + if (!has_install) + continue; } walk->action = ACTION_SUBTREE; @@ -164,14 +200,26 @@ static int walk_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end, if (walk->action == ACTION_AGAIN) goto again; - - if ((!walk->vma && (pud_leaf(*pud) || !pud_present(*pud))) || - walk->action == ACTION_CONTINUE || - !(ops->pmd_entry || ops->pte_entry)) + if (walk->action == ACTION_CONTINUE) continue; + if (!has_handler) { /* No handlers for lower page tables. */ + if (!has_install) + continue; /* Nothing to do. */ + /* + * We are ONLY installing, so avoid unnecessarily + * splitting a present huge page. + */ + if (pud_present(*pud) && + (pud_trans_huge(*pud) || pud_devmap(*pud))) + continue; + } + if (walk->vma) split_huge_pud(walk->vma, pud, addr); + else if (pud_leaf(*pud) || !pud_present(*pud)) + continue; /* Nothing to do. */ + if (pud_none(*pud)) goto again; @@ -189,6 +237,8 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, p4d_t *p4d; unsigned long next; const struct mm_walk_ops *ops = walk->ops; + bool has_handler = ops->pud_entry || ops->pmd_entry || ops->pte_entry; + bool has_install = ops->install_pte; int err = 0; int depth = real_depth(1); @@ -196,18 +246,21 @@ static int walk_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, do { next = p4d_addr_end(addr, end); if (p4d_none_or_clear_bad(p4d)) { - if (ops->pte_hole) + if (has_install) + err = __pud_alloc(walk->mm, p4d, addr); + else if (ops->pte_hole) err = ops->pte_hole(addr, next, depth, walk); if (err) break; - continue; + if (!has_install) + continue; } if (ops->p4d_entry) { err = ops->p4d_entry(p4d, addr, next, walk); if (err) break; } - if (ops->pud_entry || ops->pmd_entry || ops->pte_entry) + if (has_handler || has_install) err = walk_pud_range(p4d, addr, next, walk); if (err) break; @@ -222,6 +275,9 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, pgd_t *pgd; unsigned long next; const struct mm_walk_ops *ops = walk->ops; + bool has_handler = ops->p4d_entry || ops->pud_entry || ops->pmd_entry || + ops->pte_entry; + bool has_install = ops->install_pte; int err = 0; if (walk->pgd) @@ -231,18 +287,21 @@ static int walk_pgd_range(unsigned long addr, unsigned long end, do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) { - if (ops->pte_hole) + if (has_install) + err = __p4d_alloc(walk->mm, pgd, addr); + else if (ops->pte_hole) err = ops->pte_hole(addr, next, 0, walk); if (err) break; - continue; + if (!has_install) + continue; } if (ops->pgd_entry) { err = ops->pgd_entry(pgd, addr, next, walk); if (err) break; } - if (ops->p4d_entry || ops->pud_entry || ops->pmd_entry || ops->pte_entry) + if (has_handler || has_install) err = walk_p4d_range(pgd, addr, next, walk); if (err) break; @@ -334,6 +393,11 @@ static int __walk_page_range(unsigned long start, unsigned 
long end, int err = 0; struct vm_area_struct *vma = walk->vma; const struct mm_walk_ops *ops = walk->ops; + bool is_hugetlb = is_vm_hugetlb_page(vma); + + /* We do not support hugetlb PTE installation. */ + if (ops->install_pte && is_hugetlb) + return -EINVAL; if (ops->pre_vma) { err = ops->pre_vma(start, end, walk); @@ -341,7 +405,7 @@ static int __walk_page_range(unsigned long start, unsigned long end, return err; } - if (is_vm_hugetlb_page(vma)) { + if (is_hugetlb) { if (ops->hugetlb_entry) err = walk_hugetlb_range(start, end, walk); } else @@ -380,47 +444,14 @@ static inline void process_vma_walk_lock(struct vm_area_struct *vma, #endif } -/** - * walk_page_range - walk page table with caller specific callbacks - * @mm: mm_struct representing the target process of page table walk - * @start: start address of the virtual address range - * @end: end address of the virtual address range - * @ops: operation to call during the walk - * @private: private data for callbacks' usage - * - * Recursively walk the page table tree of the process represented by @mm - * within the virtual address range [@start, @end). During walking, we can do - * some caller-specific works for each entry, by setting up pmd_entry(), - * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these - * callbacks, the associated entries/pages are just ignored. - * The return values of these callbacks are commonly defined like below: - * - * - 0 : succeeded to handle the current entry, and if you don't reach the - * end address yet, continue to walk. - * - >0 : succeeded to handle the current entry, and return to the caller - * with caller specific value. - * - <0 : failed to handle the current entry, and return to the caller - * with error code. - * - * Before starting to walk page table, some callers want to check whether - * they really want to walk over the current vma, typically by checking - * its vm_flags. walk_page_test() and @ops->test_walk() are used for this - * purpose. - * - * If operations need to be staged before and committed after a vma is walked, - * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), - * since it is intended to handle commit-type operations, can't return any - * errors. - * - * struct mm_walk keeps current values of some common data like vma and pmd, - * which are useful for the access from callbacks. If you want to pass some - * caller-specific data to callbacks, @private should be helpful. +/* + * See the comment for walk_page_range(), this performs the heavy lifting of the + * operation, only sets no restrictions on how the walk proceeds. * - * Locking: - * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, - * because these function traverse vma list and/or access to vma's data. + * We usually restrict the ability to install PTEs, but this functionality is + * available to internal memory management code and provided in mm/internal.h. */ -int walk_page_range(struct mm_struct *mm, unsigned long start, +int walk_page_range_mm(struct mm_struct *mm, unsigned long start, unsigned long end, const struct mm_walk_ops *ops, void *private) { @@ -479,6 +510,80 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, return err; } +/* + * Determine if the walk operations specified are permitted to be used for a + * page table walk. + * + * This check is performed on all functions which are parameterised by walk + * operations and exposed in include/linux/pagewalk.h. 
+ * + * Internal memory management code can use the walk_page_range_mm() function to + * be able to use all page walking operations. + */ +static bool check_ops_valid(const struct mm_walk_ops *ops) +{ + /* + * The installation of PTEs is solely under the control of memory + * management logic and subject to many subtle locking, security and + * cache considerations so we cannot permit other users to do so, and + * certainly not for exported symbols. + */ + if (ops->install_pte) + return false; + + return true; +} + +/** + * walk_page_range - walk page table with caller specific callbacks + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @private: private data for callbacks' usage + * + * Recursively walk the page table tree of the process represented by @mm + * within the virtual address range [@start, @end). During walking, we can do + * some caller-specific works for each entry, by setting up pmd_entry(), + * pte_entry(), and/or hugetlb_entry(). If you don't set up for some of these + * callbacks, the associated entries/pages are just ignored. + * The return values of these callbacks are commonly defined like below: + * + * - 0 : succeeded to handle the current entry, and if you don't reach the + * end address yet, continue to walk. + * - >0 : succeeded to handle the current entry, and return to the caller + * with caller specific value. + * - <0 : failed to handle the current entry, and return to the caller + * with error code. + * + * Before starting to walk page table, some callers want to check whether + * they really want to walk over the current vma, typically by checking + * its vm_flags. walk_page_test() and @ops->test_walk() are used for this + * purpose. + * + * If operations need to be staged before and committed after a vma is walked, + * there are two callbacks, pre_vma() and post_vma(). Note that post_vma(), + * since it is intended to handle commit-type operations, can't return any + * errors. + * + * struct mm_walk keeps current values of some common data like vma and pmd, + * which are useful for the access from callbacks. If you want to pass some + * caller-specific data to callbacks, @private should be helpful. + * + * Locking: + * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock, + * because these function traverse vma list and/or access to vma's data. + */ +int walk_page_range(struct mm_struct *mm, unsigned long start, + unsigned long end, const struct mm_walk_ops *ops, + void *private) +{ + if (!check_ops_valid(ops)) + return -EINVAL; + + return walk_page_range_mm(mm, start, end, ops, private); +} + /** * walk_page_range_novma - walk a range of pagetables not backed by a vma * @mm: mm_struct representing the target process of page table walk @@ -494,7 +599,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, * walking the kernel pages tables or page tables for firmware. * * Note: Be careful to walk the kernel pages tables, the caller may be need to - * take other effective approache (mmap lock may be insufficient) to prevent + * take other effective approaches (mmap lock may be insufficient) to prevent * the intermediate kernel page tables belonging to the specified address range * from being freed (e.g. memory hot-remove). 
*/ @@ -513,6 +618,8 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start, if (start >= end || !walk.mm) return -EINVAL; + if (!check_ops_valid(ops)) + return -EINVAL; /* * 1) For walking the user virtual address space: @@ -556,6 +663,8 @@ int walk_page_range_vma(struct vm_area_struct *vma, unsigned long start, return -EINVAL; if (start < vma->vm_start || end > vma->vm_end) return -EINVAL; + if (!check_ops_valid(ops)) + return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); @@ -574,6 +683,8 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops, if (!walk.mm) return -EINVAL; + if (!check_ops_valid(ops)) + return -EINVAL; process_mm_walk_lock(walk.mm, ops->walk_lock); process_vma_walk_lock(vma, ops->walk_lock); @@ -623,6 +734,9 @@ int walk_page_mapping(struct address_space *mapping, pgoff_t first_index, unsigned long start_addr, end_addr; int err = 0; + if (!check_ops_valid(ops)) + return -EINVAL; + lockdep_assert_held(&mapping->i_mmap_rwsem); vma_interval_tree_foreach(vma, &mapping->i_mmap, first_index, first_index + nr - 1) { From 7c53dfbdb024915f23f03f972b33744309d4608b Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Oct 2024 14:13:28 +0000 Subject: [PATCH 172/215] mm: add PTE_MARKER_GUARD PTE marker Add a new PTE marker that results in any access causing the accessing process to segfault. This is preferable to PTE_MARKER_POISONED, which results in the same handling as hardware poisoned memory, and is thus undesirable for cases where we simply wish to 'soft' poison a range. This is in preparation for implementing the ability to specify guard pages at the page table level, i.e. ranges that, when accessed, should cause process termination. Additionally, rename zap_drop_file_uffd_wp() to zap_drop_markers() - the function checks the ZAP_FLAG_DROP_MARKER flag so naming it for this single purpose was simply incorrect. We then reuse the same logic to determine whether a zap should clear a guard entry - this should only be performed on teardown and never on MADV_DONTNEED or MADV_FREE. We additionally add a WARN_ON_ONCE() in hugetlb logic should a guard marker be encountered there, as we explicitly do not support this operation and this should not occur. Link: https://lkml.kernel.org/r/f47f3d5acca2dcf9bbf655b6d33f3dc713e4a4a0.1730123433.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Vlastimil Babka Suggested-by: Vlastimil Babka Suggested-by: Jann Horn Suggested-by: David Hildenbrand Cc: Arnd Bergmann Cc: Christian Brauner Cc: Christoph Hellwig Cc: Chris Zankel Cc: Helge Deller Cc: James E.J. Bottomley Cc: Jeff Xu Cc: John Hubbard Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Matt Turner Cc: Max Filippov Cc: Muchun Song Cc: Paul E. McKenney Cc: Richard Henderson Cc: Shuah Khan Cc: Shuah Khan Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 2 +- include/linux/swapops.h | 24 +++++++++++++++++++++++- mm/hugetlb.c | 4 ++++ mm/memory.c | 18 +++++++++++++++--- mm/mprotect.c | 6 ++++-- 5 files changed, 47 insertions(+), 7 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 355cf46a01a61..1b6a917fffa4b 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -544,7 +544,7 @@ static inline pte_marker copy_pte_marker( { pte_marker srcm = pte_marker_get(entry); /* Always copy error entries. 
*/ - pte_marker dstm = srcm & PTE_MARKER_POISONED; + pte_marker dstm = srcm & (PTE_MARKER_POISONED | PTE_MARKER_GUARD); /* Only copy PTE markers if UFFD register matches. */ if ((srcm & PTE_MARKER_UFFD_WP) && userfaultfd_wp(dst_vma)) diff --git a/include/linux/swapops.h b/include/linux/swapops.h index cb468e418ea11..96f26e29fefed 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -426,9 +426,19 @@ typedef unsigned long pte_marker; * "Poisoned" here is meant in the very general sense of "future accesses are * invalid", instead of referring very specifically to hardware memory errors. * This marker is meant to represent any of various different causes of this. + * + * Note that, when encountered by the faulting logic, PTEs with this marker will + * result in VM_FAULT_HWPOISON and thus regardless trigger hardware memory error + * logic. */ #define PTE_MARKER_POISONED BIT(1) -#define PTE_MARKER_MASK (BIT(2) - 1) +/* + * Indicates that, on fault, this PTE will case a SIGSEGV signal to be + * sent. This means guard markers behave in effect as if the region were mapped + * PROT_NONE, rather than if they were a memory hole or equivalent. + */ +#define PTE_MARKER_GUARD BIT(2) +#define PTE_MARKER_MASK (BIT(3) - 1) static inline swp_entry_t make_pte_marker_entry(pte_marker marker) { @@ -464,6 +474,18 @@ static inline int is_poisoned_swp_entry(swp_entry_t entry) { return is_pte_marker_entry(entry) && (pte_marker_get(entry) & PTE_MARKER_POISONED); + +} + +static inline swp_entry_t make_guard_swp_entry(void) +{ + return make_pte_marker_entry(PTE_MARKER_GUARD); +} + +static inline int is_guard_swp_entry(swp_entry_t entry) +{ + return is_pte_marker_entry(entry) && + (pte_marker_get(entry) & PTE_MARKER_GUARD); } /* diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 906294ac85dc8..2c8c5da0f5d32 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6353,6 +6353,10 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); goto out_mutex; + } else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) { + /* This isn't supported in hugetlb. */ + ret = VM_FAULT_SIGSEGV; + goto out_mutex; } } diff --git a/mm/memory.c b/mm/memory.c index 2d32023d4eb87..75c2dfd04f725 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1455,7 +1455,7 @@ static inline bool should_zap_folio(struct zap_details *details, return !folio_test_anon(folio); } -static inline bool zap_drop_file_uffd_wp(struct zap_details *details) +static inline bool zap_drop_markers(struct zap_details *details) { if (!details) return false; @@ -1476,7 +1476,7 @@ zap_install_uffd_wp_if_needed(struct vm_area_struct *vma, if (vma_is_anonymous(vma)) return; - if (zap_drop_file_uffd_wp(details)) + if (zap_drop_markers(details)) return; for (;;) { @@ -1671,7 +1671,15 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, * drop the marker if explicitly requested. */ if (!vma_is_anonymous(vma) && - !zap_drop_file_uffd_wp(details)) + !zap_drop_markers(details)) + continue; + } else if (is_guard_swp_entry(entry)) { + /* + * Ordinary zapping should not remove guard PTE + * markers. Only do so if we should remove PTE markers + * in general. + */ + if (!zap_drop_markers(details)) continue; } else if (is_hwpoison_entry(entry) || is_poisoned_swp_entry(entry)) { @@ -4003,6 +4011,10 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) if (marker & PTE_MARKER_POISONED) return VM_FAULT_HWPOISON; + /* Hitting a guard page is always a fatal condition. 
*/ + if (marker & PTE_MARKER_GUARD) + return VM_FAULT_SIGSEGV; + if (pte_marker_entry_uffd_wp(entry)) return pte_marker_handle_uffd_wp(vmf); diff --git a/mm/mprotect.c b/mm/mprotect.c index 6f450af3252eb..516b1d847e2cd 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -236,9 +236,11 @@ static long change_pte_range(struct mmu_gather *tlb, } else if (is_pte_marker_entry(entry)) { /* * Ignore error swap entries unconditionally, - * because any access should sigbus anyway. + * because any access should sigbus/sigsegv + * anyway. */ - if (is_poisoned_swp_entry(entry)) + if (is_poisoned_swp_entry(entry) || + is_guard_swp_entry(entry)) continue; /* * If this is uffd-wp pte marker and we'd like

From 662df3e5c37666d6ed75c88098699e070a4b35b5 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Oct 2024 14:13:29 +0000 Subject: [PATCH 173/215] mm: madvise: implement lightweight guard page mechanism

Implement a new lightweight guard page feature, that is, regions of userland virtual memory that, when accessed, cause a fatal signal to arise.

Currently users must establish PROT_NONE ranges to achieve this. However this is very costly memory-wise - we need a VMA for each and every one of these regions AND they become unmergeable with surrounding VMAs. In addition repeated mmap() calls require repeated kernel context switches and contention of the mmap lock to install these ranges, potentially also having to unmap memory if installed over existing ranges.

The lightweight guard approach eliminates the VMA cost altogether - rather than establishing a PROT_NONE VMA, it operates at the level of page table entries, establishing PTE markers such that accesses to them cause a fault followed by a SIGSEGV signal being raised. This is achieved through the PTE marker mechanism, which we have already extended to provide PTE_MARKER_GUARD, and which we install via the generic page walking logic, which we have also extended for this purpose.

These guard ranges are established with MADV_GUARD_INSTALL. If the range in which they are installed contains any existing mappings, those mappings will be zapped, i.e. the range is freed and the memory unmapped (thus mimicking the behaviour of MADV_DONTNEED in this respect). Any existing guard entries will be left untouched. There is therefore no nesting of guarded pages.

Guarded ranges are NOT cleared by MADV_DONTNEED nor MADV_FREE (in both instances the memory range may be reused, at which point a user would expect guards to still be in place), but they are cleared via MADV_GUARD_REMOVE, process teardown or unmapping of memory ranges.

The guard property can be removed from ranges via MADV_GUARD_REMOVE. The ranges over which this is applied, should they contain non-guard entries, will be untouched, with only guard entries being cleared.

We permit this operation on anonymous memory only, and only on VMAs which are non-special, non-huge and not mlock()'d (if we permitted this we'd have to drop locked pages, which would be rather counterintuitive).

Racing page faults can interrupt attempts to install guard pages; each interruption results in a zap and another attempt, so the process can end up being repeated. If this happens more often than would be expected in normal operation, we rescind locks and retry the whole thing, which avoids lock contention in this scenario.
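For illustration only, and not part of this patch, the following is a minimal sketch of the intended userspace usage. It assumes a libc whose headers do not yet expose the new flags, so it falls back to the numeric MADV_GUARD_* values added to the uapi headers below:

#include <stdio.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MADV_GUARD_INSTALL
#define MADV_GUARD_INSTALL 102	/* matches the uapi value added below */
#define MADV_GUARD_REMOVE  103
#endif

int main(void)
{
	size_t page_size = sysconf(_SC_PAGESIZE);
	/* A 16-page anonymous, private mapping; only such VMAs may be guarded. */
	char *buf = mmap(NULL, 16 * page_size, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED)
		return EXIT_FAILURE;

	/* Guard the first and last page; no additional VMAs are created. */
	if (madvise(buf, page_size, MADV_GUARD_INSTALL) ||
	    madvise(&buf[15 * page_size], page_size, MADV_GUARD_INSTALL))
		return EXIT_FAILURE;

	/* Interior pages remain usable; touching a guard page raises SIGSEGV. */
	buf[page_size] = 'x';

	/* The guard property can be lifted again without unmapping. */
	if (madvise(buf, 16 * page_size, MADV_GUARD_REMOVE))
		return EXIT_FAILURE;

	munmap(buf, 16 * page_size);
	return EXIT_SUCCESS;
}

Compared with surrounding the buffer with separate PROT_NONE mappings, this keeps the region a single, mergeable VMA while the guard pages still fault fatally on access.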
Link: https://lkml.kernel.org/r/6aafb5821bf209f277dfae0787abb2ef87a37542.1730123433.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Suggested-by: Vlastimil Babka Suggested-by: Jann Horn Suggested-by: David Hildenbrand Suggested-by: Vlastimil Babka Suggested-by: Jann Horn Suggested-by: David Hildenbrand Acked-by: Vlastimil Babka Cc: Arnd Bergmann Cc: Christian Brauner Cc: Christoph Hellwig Cc: Chris Zankel Cc: Helge Deller Cc: James E.J. Bottomley Cc: Jeff Xu Cc: John Hubbard Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Matt Turner Cc: Max Filippov Cc: Muchun Song Cc: Paul E. McKenney Cc: Richard Henderson Cc: Shuah Khan Cc: Shuah Khan Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Signed-off-by: Andrew Morton --- arch/alpha/include/uapi/asm/mman.h | 3 + arch/mips/include/uapi/asm/mman.h | 3 + arch/parisc/include/uapi/asm/mman.h | 3 + arch/xtensa/include/uapi/asm/mman.h | 3 + include/uapi/asm-generic/mman-common.h | 3 + mm/madvise.c | 239 +++++++++++++++++++++++++ mm/mseal.c | 1 + 7 files changed, 255 insertions(+) diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 763929e814e9a..1e700468a6858 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -78,6 +78,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 9c48d9a21aa01..b700dae28c482 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -105,6 +105,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index 68c44f99bc931..b6a709506987e 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -75,6 +75,9 @@ #define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 1ff0c858544fa..99d4ccee7f6e8 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -113,6 +113,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432c..1ea2c4c33b86a 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -79,6 +79,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/mm/madvise.c b/mm/madvise.c index e871a72a6c329..0ceae57da7dad 100644 --- a/mm/madvise.c +++ 
b/mm/madvise.c @@ -37,6 +37,12 @@ #include "internal.h" #include "swap.h" +/* + * Maximum number of attempts we make to install guard pages before we give up + * and return -ERESTARTNOINTR to have userspace try again. + */ +#define MAX_MADVISE_GUARD_RETRIES 3 + struct madvise_walk_private { struct mmu_gather *tlb; bool pageout; @@ -60,6 +66,8 @@ static int madvise_need_mmap_write(int behavior) case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: case MADV_COLLAPSE: + case MADV_GUARD_INSTALL: + case MADV_GUARD_REMOVE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -1017,6 +1025,214 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +static bool is_valid_guard_vma(struct vm_area_struct *vma, bool allow_locked) +{ + vm_flags_t disallowed = VM_SPECIAL | VM_HUGETLB; + + /* + * A user could lock after setting a guard range but that's fine, as + * they'd not be able to fault in. The issue arises when we try to zap + * existing locked VMAs. We don't want to do that. + */ + if (!allow_locked) + disallowed |= VM_LOCKED; + + if (!vma_is_anonymous(vma)) + return false; + + if ((vma->vm_flags & (VM_MAYWRITE | disallowed)) != VM_MAYWRITE) + return false; + + return true; +} + +static bool is_guard_pte_marker(pte_t ptent) +{ + return is_pte_marker(ptent) && + is_guard_swp_entry(pte_to_swp_entry(ptent)); +} + +static int guard_install_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t pudval = pudp_get(pud); + + /* If huge return >0 so we abort the operation + zap. */ + return pud_trans_huge(pudval) || pud_devmap(pudval); +} + +static int guard_install_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t pmdval = pmdp_get(pmd); + + /* If huge return >0 so we abort the operation + zap. */ + return pmd_trans_huge(pmdval) || pmd_devmap(pmdval); +} + +static int guard_install_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t pteval = ptep_get(pte); + unsigned long *nr_pages = (unsigned long *)walk->private; + + /* If there is already a guard page marker, we have nothing to do. */ + if (is_guard_pte_marker(pteval)) { + (*nr_pages)++; + + return 0; + } + + /* If populated return >0 so we abort the operation + zap. */ + return 1; +} + +static int guard_install_set_pte(unsigned long addr, unsigned long next, + pte_t *ptep, struct mm_walk *walk) +{ + unsigned long *nr_pages = (unsigned long *)walk->private; + + /* Simply install a PTE marker, this causes segfault on access. */ + *ptep = make_pte_marker(PTE_MARKER_GUARD); + (*nr_pages)++; + + return 0; +} + +static const struct mm_walk_ops guard_install_walk_ops = { + .pud_entry = guard_install_pud_entry, + .pmd_entry = guard_install_pmd_entry, + .pte_entry = guard_install_pte_entry, + .install_pte = guard_install_set_pte, + .walk_lock = PGWALK_RDLOCK, +}; + +static long madvise_guard_install(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + long err; + int i; + + *prev = vma; + if (!is_valid_guard_vma(vma, /* allow_locked = */false)) + return -EINVAL; + + /* + * If we install guard markers, then the range is no longer + * empty from a page table perspective and therefore it's + * appropriate to have an anon_vma. + * + * This ensures that on fork, we copy page tables correctly. + */ + err = anon_vma_prepare(vma); + if (err) + return err; + + /* + * Optimistically try to install the guard marker pages first. 
If any + * non-guard pages are encountered, give up and zap the range before + * trying again. + * + * We try a few times before giving up and releasing back to userland to + * loop around, releasing locks in the process to avoid contention. This + * would only happen if there was a great many racing page faults. + * + * In most cases we should simply install the guard markers immediately + * with no zap or looping. + */ + for (i = 0; i < MAX_MADVISE_GUARD_RETRIES; i++) { + unsigned long nr_pages = 0; + + /* Returns < 0 on error, == 0 if success, > 0 if zap needed. */ + err = walk_page_range_mm(vma->vm_mm, start, end, + &guard_install_walk_ops, &nr_pages); + if (err < 0) + return err; + + if (err == 0) { + unsigned long nr_expected_pages = PHYS_PFN(end - start); + + VM_WARN_ON(nr_pages != nr_expected_pages); + return 0; + } + + /* + * OK some of the range have non-guard pages mapped, zap + * them. This leaves existing guard pages in place. + */ + zap_page_range_single(vma, start, end - start, NULL); + } + + /* + * We were unable to install the guard pages due to being raced by page + * faults. This should not happen ordinarily. We return to userspace and + * immediately retry, relieving lock contention. + */ + return restart_syscall(); +} + +static int guard_remove_pud_entry(pud_t *pud, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pud_t pudval = pudp_get(pud); + + /* If huge, cannot have guard pages present, so no-op - skip. */ + if (pud_trans_huge(pudval) || pud_devmap(pudval)) + walk->action = ACTION_CONTINUE; + + return 0; +} + +static int guard_remove_pmd_entry(pmd_t *pmd, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pmd_t pmdval = pmdp_get(pmd); + + /* If huge, cannot have guard pages present, so no-op - skip. */ + if (pmd_trans_huge(pmdval) || pmd_devmap(pmdval)) + walk->action = ACTION_CONTINUE; + + return 0; +} + +static int guard_remove_pte_entry(pte_t *pte, unsigned long addr, + unsigned long next, struct mm_walk *walk) +{ + pte_t ptent = ptep_get(pte); + + if (is_guard_pte_marker(ptent)) { + /* Simply clear the PTE marker. */ + pte_clear_not_present_full(walk->mm, addr, pte, false); + update_mmu_cache(walk->vma, addr, pte); + } + + return 0; +} + +static const struct mm_walk_ops guard_remove_walk_ops = { + .pud_entry = guard_remove_pud_entry, + .pmd_entry = guard_remove_pmd_entry, + .pte_entry = guard_remove_pte_entry, + .walk_lock = PGWALK_RDLOCK, +}; + +static long madvise_guard_remove(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + *prev = vma; + /* + * We're ok with removing guards in mlock()'d ranges, as this is a + * non-destructive action. + */ + if (!is_valid_guard_vma(vma, /* allow_locked = */true)) + return -EINVAL; + + return walk_page_range(vma->vm_mm, start, end, + &guard_remove_walk_ops, NULL); +} + /* * Apply an madvise behavior to a region of a vma. 
madvise_update_vma * will handle splitting a vm area into separate areas, each area with its own @@ -1098,6 +1314,10 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; case MADV_COLLAPSE: return madvise_collapse(vma, prev, start, end); + case MADV_GUARD_INSTALL: + return madvise_guard_install(vma, prev, start, end); + case MADV_GUARD_REMOVE: + return madvise_guard_remove(vma, prev, start, end); } anon_name = anon_vma_name(vma); @@ -1197,6 +1417,8 @@ madvise_behavior_valid(int behavior) case MADV_DODUMP: case MADV_WIPEONFORK: case MADV_KEEPONFORK: + case MADV_GUARD_INSTALL: + case MADV_GUARD_REMOVE: #ifdef CONFIG_MEMORY_FAILURE case MADV_SOFT_OFFLINE: case MADV_HWPOISON: @@ -1490,6 +1712,23 @@ static ssize_t vector_madvise(struct mm_struct *mm, struct iov_iter *iter, while (iov_iter_count(iter)) { ret = do_madvise(mm, (unsigned long)iter_iov_addr(iter), iter_iov_len(iter), behavior); + /* + * An madvise operation is attempting to restart the syscall, + * but we cannot proceed as it would not be correct to repeat + * the operation in aggregate, and would be surprising to the + * user. + * + * As we have already dropped locks, it is safe to just loop and + * try again. We check for fatal signals in case we need exit + * early anyway. + */ + if (ret == -ERESTARTNOINTR) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + break; + } + continue; + } if (ret < 0) break; iov_iter_advance(iter, iter_iov_len(iter)); diff --git a/mm/mseal.c b/mm/mseal.c index ece977bd21e1a..81d6e980e8a91 100644 --- a/mm/mseal.c +++ b/mm/mseal.c @@ -30,6 +30,7 @@ static bool is_madv_discard(int behavior) case MADV_REMOVE: case MADV_DONTFORK: case MADV_WIPEONFORK: + case MADV_GUARD_INSTALL: return true; } From 75d60eb30daafb966db0e45f38e4cdeb5e5ed79c Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Oct 2024 14:13:30 +0000 Subject: [PATCH 174/215] tools: testing: update tools UAPI header for mman-common.h Import the new MADV_GUARD_INSTALL/REMOVE madvise flags. Link: https://lkml.kernel.org/r/ada462fa73fa1defc114242e446ab625b8290b71.1730123433.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Suggested-by: Vlastimil Babka Suggested-by: Jann Horn Suggested-by: David Hildenbrand Cc: Arnd Bergmann Cc: Christian Brauner Cc: Christoph Hellwig Cc: Chris Zankel Cc: Helge Deller Cc: James E.J. Bottomley Cc: Jeff Xu Cc: John Hubbard Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Matt Turner Cc: Max Filippov Cc: Muchun Song Cc: Paul E. McKenney Cc: Richard Henderson Cc: Shuah Khan Cc: Shuah Khan Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/include/uapi/asm-generic/mman-common.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 6ce1f1ceb432c..1ea2c4c33b86a 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -79,6 +79,9 @@ #define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ +#define MADV_GUARD_INSTALL 102 /* fatal signal on access to range */ +#define MADV_GUARD_REMOVE 103 /* unguard range */ + /* compatibility flags */ #define MAP_FILE 0 From 876320d71f515407b81eb08a1d019f19f34907d7 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 28 Oct 2024 14:13:31 +0000 Subject: [PATCH 175/215] selftests/mm: add self tests for guard page feature Utilise the kselftest harmness to implement tests for the guard page implementation. 
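As a condensed, illustrative sketch only (helper names here are hypothetical): the tests probe guarded pages by attempting an access and treating a caught SIGSEGV as the expected outcome, roughly as follows.

#define _GNU_SOURCE
#include <setjmp.h>
#include <signal.h>
#include <stdbool.h>

static sigjmp_buf jump_buf;
static volatile sig_atomic_t jump_armed;

static void on_fatal_signal(int sig)
{
	if (jump_armed)
		siglongjmp(jump_buf, sig);
}

static void arm_handler(void)
{
	struct sigaction act = {
		.sa_handler = on_fatal_signal,
		.sa_flags = SA_NODEFER,
	};

	sigemptyset(&act.sa_mask);
	sigaction(SIGSEGV, &act, NULL);
}

/* Returns true if the write succeeded, false if it raised a fatal signal. */
static bool try_write(char *ptr)
{
	bool ok;

	jump_armed = true;
	/* On SIGSEGV the handler jumps back here with a non-zero value. */
	ok = sigsetjmp(jump_buf, 0) == 0;
	if (ok)
		*(volatile char *)ptr = 'x';
	jump_armed = false;

	return ok;
}

The full harness in the diff below additionally distinguishes read from write accesses and restores the default SIGSEGV disposition in the fixture teardown.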
We start by implement basic tests asserting that guard pages can be installed, removed and that touching guard pages result in SIGSEGV. We also assert that, in removing guard pages from a range, non-guard pages remain intact. We then examine different operations on regions containing guard markers behave to ensure correct behaviour: * Operations over multiple VMAs operate as expected. * Invoking MADV_GUARD_INSTALL / MADV_GUARD_REMOVE via process_madvise() in batches works correctly. * Ensuring that munmap() correctly tears down guard markers. * Using mprotect() to adjust protection bits does not in any way override or cause issues with guard markers. * Ensuring that splitting and merging VMAs around guard markers causes no issue - i.e. that a marker which 'belongs' to one VMA can function just as well 'belonging' to another. * Ensuring that madvise(..., MADV_DONTNEED) and madvise(..., MADV_FREE) do not remove guard markers. * Ensuring that mlock()'ing a range containing guard markers does not cause issues. * Ensuring that mremap() can move a guard range and retain guard markers. * Ensuring that mremap() can expand a guard range and retain guard markers (perhaps moving the range). * Ensuring that mremap() can shrink a guard range and retain guard markers. * Ensuring that forking a process correctly retains guard markers. * Ensuring that forking a VMA with VM_WIPEONFORK set behaves sanely. * Ensuring that lazyfree simply clears guard markers. * Ensuring that userfaultfd can co-exist with guard pages. * Ensuring that madvise(..., MADV_POPULATE_READ) and madvise(..., MADV_POPULATE_WRITE) error out when encountering guard markers. * Ensuring that madvise(..., MADV_COLD) and madvise(..., MADV_PAGEOUT) do not remove guard markers. If any test is unable to be run due to lack of permissions, that test is skipped. Link: https://lkml.kernel.org/r/c3dcca76b736bac0aeaf1dc085927536a253ac94.1730123433.git.lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Reviewed-by: Shuah Khan Suggested-by: Vlastimil Babka Suggested-by: Jann Horn Suggested-by: David Hildenbrand Cc: Arnd Bergmann Cc: Christian Brauner Cc: Christoph Hellwig Cc: Chris Zankel Cc: Helge Deller Cc: James E.J. Bottomley Cc: Jeff Xu Cc: John Hubbard Cc: Liam R. Howlett Cc: Matthew Wilcox (Oracle) Cc: Matt Turner Cc: Max Filippov Cc: Muchun Song Cc: Paul E. 
McKenney Cc: Richard Henderson Cc: Shuah Khan Cc: Sidhartha Kumar Cc: Suren Baghdasaryan Cc: Thomas Bogendoerfer Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/.gitignore | 1 + tools/testing/selftests/mm/Makefile | 1 + tools/testing/selftests/mm/guard-pages.c | 1243 ++++++++++++++++++++++ 3 files changed, 1245 insertions(+) create mode 100644 tools/testing/selftests/mm/guard-pages.c diff --git a/tools/testing/selftests/mm/.gitignore b/tools/testing/selftests/mm/.gitignore index 689bbd5202964..8f01f4da1c0de 100644 --- a/tools/testing/selftests/mm/.gitignore +++ b/tools/testing/selftests/mm/.gitignore @@ -54,3 +54,4 @@ droppable hugetlb_dio pkey_sighandler_tests_32 pkey_sighandler_tests_64 +guard-pages diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 02e1204971b0a..15c734d6cfec5 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -79,6 +79,7 @@ TEST_GEN_FILES += hugetlb_fault_after_madv TEST_GEN_FILES += hugetlb_madv_vs_map TEST_GEN_FILES += hugetlb_dio TEST_GEN_FILES += droppable +TEST_GEN_FILES += guard-pages ifneq ($(ARCH),arm64) TEST_GEN_FILES += soft-dirty diff --git a/tools/testing/selftests/mm/guard-pages.c b/tools/testing/selftests/mm/guard-pages.c new file mode 100644 index 0000000000000..7cdf815d0d63b --- /dev/null +++ b/tools/testing/selftests/mm/guard-pages.c @@ -0,0 +1,1243 @@ +// SPDX-License-Identifier: GPL-2.0-or-later + +#define _GNU_SOURCE +#include "../kselftest_harness.h" +#include /* Force the import of the tools version. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Ignore the checkpatch warning, as per the C99 standard, section 7.14.1.1: + * + * "If the signal occurs other than as the result of calling the abort or raise + * function, the behavior is undefined if the signal handler refers to any + * object with static storage duration other than by assigning a value to an + * object declared as volatile sig_atomic_t" + */ +static volatile sig_atomic_t signal_jump_set; +static sigjmp_buf signal_jmp_buf; + +/* + * Ignore the checkpatch warning, we must read from x but don't want to do + * anything with it in order to trigger a read page fault. We therefore must use + * volatile to stop the compiler from optimising this away. + */ +#define FORCE_READ(x) (*(volatile typeof(x) *)x) + +static int userfaultfd(int flags) +{ + return syscall(SYS_userfaultfd, flags); +} + +static void handle_fatal(int c) +{ + if (!signal_jump_set) + return; + + siglongjmp(signal_jmp_buf, c); +} + +static int pidfd_open(pid_t pid, unsigned int flags) +{ + return syscall(SYS_pidfd_open, pid, flags); +} + +/* + * Enable our signal catcher and try to read/write the specified buffer. The + * return value indicates whether the read/write succeeds without a fatal + * signal. + */ +static bool try_access_buf(char *ptr, bool write) +{ + bool failed; + + /* Tell signal handler to jump back here on fatal signal. */ + signal_jump_set = true; + /* If a fatal signal arose, we will jump back here and failed is set. */ + failed = sigsetjmp(signal_jmp_buf, 0) != 0; + + if (!failed) { + if (write) + *ptr = 'x'; + else + FORCE_READ(ptr); + } + + signal_jump_set = false; + return !failed; +} + +/* Try and read from a buffer, return true if no fatal signal. 
*/ +static bool try_read_buf(char *ptr) +{ + return try_access_buf(ptr, false); +} + +/* Try and write to a buffer, return true if no fatal signal. */ +static bool try_write_buf(char *ptr) +{ + return try_access_buf(ptr, true); +} + +/* + * Try and BOTH read from AND write to a buffer, return true if BOTH operations + * succeed. + */ +static bool try_read_write_buf(char *ptr) +{ + return try_read_buf(ptr) && try_write_buf(ptr); +} + +FIXTURE(guard_pages) +{ + unsigned long page_size; +}; + +FIXTURE_SETUP(guard_pages) +{ + struct sigaction act = { + .sa_handler = &handle_fatal, + .sa_flags = SA_NODEFER, + }; + + sigemptyset(&act.sa_mask); + if (sigaction(SIGSEGV, &act, NULL)) + ksft_exit_fail_perror("sigaction"); + + self->page_size = (unsigned long)sysconf(_SC_PAGESIZE); +}; + +FIXTURE_TEARDOWN(guard_pages) +{ + struct sigaction act = { + .sa_handler = SIG_DFL, + .sa_flags = SA_NODEFER, + }; + + sigemptyset(&act.sa_mask); + sigaction(SIGSEGV, &act, NULL); +} + +TEST_F(guard_pages, basic) +{ + const unsigned long NUM_PAGES = 10; + const unsigned long page_size = self->page_size; + char *ptr; + int i; + + ptr = mmap(NULL, NUM_PAGES * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANON, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Trivially assert we can touch the first page. */ + ASSERT_TRUE(try_read_write_buf(ptr)); + + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0); + + /* Establish that 1st page SIGSEGV's. */ + ASSERT_FALSE(try_read_write_buf(ptr)); + + /* Ensure we can touch everything else.*/ + for (i = 1; i < NUM_PAGES; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + /* Establish a guard page at the end of the mapping. */ + ASSERT_EQ(madvise(&ptr[(NUM_PAGES - 1) * page_size], page_size, + MADV_GUARD_INSTALL), 0); + + /* Check that both guard pages result in SIGSEGV. */ + ASSERT_FALSE(try_read_write_buf(ptr)); + ASSERT_FALSE(try_read_write_buf(&ptr[(NUM_PAGES - 1) * page_size])); + + /* Remove the first guard page. */ + ASSERT_FALSE(madvise(ptr, page_size, MADV_GUARD_REMOVE)); + + /* Make sure we can touch it. */ + ASSERT_TRUE(try_read_write_buf(ptr)); + + /* Remove the last guard page. */ + ASSERT_FALSE(madvise(&ptr[(NUM_PAGES - 1) * page_size], page_size, + MADV_GUARD_REMOVE)); + + /* Make sure we can touch it. */ + ASSERT_TRUE(try_read_write_buf(&ptr[(NUM_PAGES - 1) * page_size])); + + /* + * Test setting a _range_ of pages, namely the first 3. The first of + * these be faulted in, so this also tests that we can install guard + * pages over backed pages. + */ + ASSERT_EQ(madvise(ptr, 3 * page_size, MADV_GUARD_INSTALL), 0); + + /* Make sure they are all guard pages. */ + for (i = 0; i < 3; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Make sure the rest are not. */ + for (i = 3; i < NUM_PAGES; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + /* Remove guard pages. */ + ASSERT_EQ(madvise(ptr, NUM_PAGES * page_size, MADV_GUARD_REMOVE), 0); + + /* Now make sure we can touch everything. */ + for (i = 0; i < NUM_PAGES; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + /* + * Now remove all guard pages, make sure we don't remove existing + * entries. 
+ */ + ASSERT_EQ(madvise(ptr, NUM_PAGES * page_size, MADV_GUARD_REMOVE), 0); + + for (i = 0; i < NUM_PAGES * page_size; i += page_size) { + char chr = ptr[i]; + + ASSERT_EQ(chr, 'x'); + } + + ASSERT_EQ(munmap(ptr, NUM_PAGES * page_size), 0); +} + +/* Assert that operations applied across multiple VMAs work as expected. */ +TEST_F(guard_pages, multi_vma) +{ + const unsigned long page_size = self->page_size; + char *ptr_region, *ptr, *ptr1, *ptr2, *ptr3; + int i; + + /* Reserve a 100 page region over which we can install VMAs. */ + ptr_region = mmap(NULL, 100 * page_size, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr_region, MAP_FAILED); + + /* Place a VMA of 10 pages size at the start of the region. */ + ptr1 = mmap(ptr_region, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr1, MAP_FAILED); + + /* Place a VMA of 5 pages size 50 pages into the region. */ + ptr2 = mmap(&ptr_region[50 * page_size], 5 * page_size, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr2, MAP_FAILED); + + /* Place a VMA of 20 pages size at the end of the region. */ + ptr3 = mmap(&ptr_region[80 * page_size], 20 * page_size, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr3, MAP_FAILED); + + /* Unmap gaps. */ + ASSERT_EQ(munmap(&ptr_region[10 * page_size], 40 * page_size), 0); + ASSERT_EQ(munmap(&ptr_region[55 * page_size], 25 * page_size), 0); + + /* + * We end up with VMAs like this: + * + * 0 10 .. 50 55 .. 80 100 + * [---] [---] [---] + */ + + /* + * Now mark the whole range as guard pages and make sure all VMAs are as + * such. + */ + + /* + * madvise() is certifiable and lets you perform operations over gaps, + * everything works, but it indicates an error and errno is set to + * -ENOMEM. Also if anything runs out of memory it is set to + * -ENOMEM. You are meant to guess which is which. + */ + ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_INSTALL), -1); + ASSERT_EQ(errno, ENOMEM); + + for (i = 0; i < 10; i++) { + char *curr = &ptr1[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + for (i = 0; i < 5; i++) { + char *curr = &ptr2[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + for (i = 0; i < 20; i++) { + char *curr = &ptr3[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Now remove guar pages over range and assert the opposite. */ + + ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_REMOVE), -1); + ASSERT_EQ(errno, ENOMEM); + + for (i = 0; i < 10; i++) { + char *curr = &ptr1[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + for (i = 0; i < 5; i++) { + char *curr = &ptr2[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + for (i = 0; i < 20; i++) { + char *curr = &ptr3[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + /* Now map incompatible VMAs in the gaps. */ + ptr = mmap(&ptr_region[10 * page_size], 40 * page_size, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + ptr = mmap(&ptr_region[55 * page_size], 25 * page_size, + PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* + * We end up with VMAs like this: + * + * 0 10 .. 50 55 .. 80 100 + * [---][xxxx][---][xxxx][---] + * + * Where 'x' signifies VMAs that cannot be merged with those adjacent to + * them. 
+ */ + + /* Multiple VMAs adjacent to one another should result in no error. */ + ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_INSTALL), 0); + for (i = 0; i < 100; i++) { + char *curr = &ptr_region[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + ASSERT_EQ(madvise(ptr_region, 100 * page_size, MADV_GUARD_REMOVE), 0); + for (i = 0; i < 100; i++) { + char *curr = &ptr_region[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr_region, 100 * page_size), 0); +} + +/* + * Assert that batched operations performed using process_madvise() work as + * expected. + */ +TEST_F(guard_pages, process_madvise) +{ + const unsigned long page_size = self->page_size; + pid_t pid = getpid(); + int pidfd = pidfd_open(pid, 0); + char *ptr_region, *ptr1, *ptr2, *ptr3; + ssize_t count; + struct iovec vec[6]; + + ASSERT_NE(pidfd, -1); + + /* Reserve region to map over. */ + ptr_region = mmap(NULL, 100 * page_size, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr_region, MAP_FAILED); + + /* + * 10 pages offset 1 page into reserve region. We MAP_POPULATE so we + * overwrite existing entries and test this code path against + * overwriting existing entries. + */ + ptr1 = mmap(&ptr_region[page_size], 10 * page_size, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE | MAP_POPULATE, -1, 0); + ASSERT_NE(ptr1, MAP_FAILED); + /* We want guard markers at start/end of each VMA. */ + vec[0].iov_base = ptr1; + vec[0].iov_len = page_size; + vec[1].iov_base = &ptr1[9 * page_size]; + vec[1].iov_len = page_size; + + /* 5 pages offset 50 pages into reserve region. */ + ptr2 = mmap(&ptr_region[50 * page_size], 5 * page_size, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr2, MAP_FAILED); + vec[2].iov_base = ptr2; + vec[2].iov_len = page_size; + vec[3].iov_base = &ptr2[4 * page_size]; + vec[3].iov_len = page_size; + + /* 20 pages offset 79 pages into reserve region. */ + ptr3 = mmap(&ptr_region[79 * page_size], 20 * page_size, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr3, MAP_FAILED); + vec[4].iov_base = ptr3; + vec[4].iov_len = page_size; + vec[5].iov_base = &ptr3[19 * page_size]; + vec[5].iov_len = page_size; + + /* Free surrounding VMAs. */ + ASSERT_EQ(munmap(ptr_region, page_size), 0); + ASSERT_EQ(munmap(&ptr_region[11 * page_size], 39 * page_size), 0); + ASSERT_EQ(munmap(&ptr_region[55 * page_size], 24 * page_size), 0); + ASSERT_EQ(munmap(&ptr_region[99 * page_size], page_size), 0); + + /* Now guard in one step. */ + count = process_madvise(pidfd, vec, 6, MADV_GUARD_INSTALL, 0); + + /* OK we don't have permission to do this, skip. */ + if (count == -1 && errno == EPERM) + ksft_exit_skip("No process_madvise() permissions, try running as root.\n"); + + /* Returns the number of bytes advised. */ + ASSERT_EQ(count, 6 * page_size); + + /* Now make sure the guarding was applied. */ + + ASSERT_FALSE(try_read_write_buf(ptr1)); + ASSERT_FALSE(try_read_write_buf(&ptr1[9 * page_size])); + + ASSERT_FALSE(try_read_write_buf(ptr2)); + ASSERT_FALSE(try_read_write_buf(&ptr2[4 * page_size])); + + ASSERT_FALSE(try_read_write_buf(ptr3)); + ASSERT_FALSE(try_read_write_buf(&ptr3[19 * page_size])); + + /* Now do the same with unguard... */ + count = process_madvise(pidfd, vec, 6, MADV_GUARD_REMOVE, 0); + + /* ...and everything should now succeed. 
*/ + + ASSERT_TRUE(try_read_write_buf(ptr1)); + ASSERT_TRUE(try_read_write_buf(&ptr1[9 * page_size])); + + ASSERT_TRUE(try_read_write_buf(ptr2)); + ASSERT_TRUE(try_read_write_buf(&ptr2[4 * page_size])); + + ASSERT_TRUE(try_read_write_buf(ptr3)); + ASSERT_TRUE(try_read_write_buf(&ptr3[19 * page_size])); + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr1, 10 * page_size), 0); + ASSERT_EQ(munmap(ptr2, 5 * page_size), 0); + ASSERT_EQ(munmap(ptr3, 20 * page_size), 0); + close(pidfd); +} + +/* Assert that unmapping ranges does not leave guard markers behind. */ +TEST_F(guard_pages, munmap) +{ + const unsigned long page_size = self->page_size; + char *ptr, *ptr_new1, *ptr_new2; + + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Guard first and last pages. */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0); + ASSERT_EQ(madvise(&ptr[9 * page_size], page_size, MADV_GUARD_INSTALL), 0); + + /* Assert that they are guarded. */ + ASSERT_FALSE(try_read_write_buf(ptr)); + ASSERT_FALSE(try_read_write_buf(&ptr[9 * page_size])); + + /* Unmap them. */ + ASSERT_EQ(munmap(ptr, page_size), 0); + ASSERT_EQ(munmap(&ptr[9 * page_size], page_size), 0); + + /* Map over them.*/ + ptr_new1 = mmap(ptr, page_size, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr_new1, MAP_FAILED); + ptr_new2 = mmap(&ptr[9 * page_size], page_size, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr_new2, MAP_FAILED); + + /* Assert that they are now not guarded. */ + ASSERT_TRUE(try_read_write_buf(ptr_new1)); + ASSERT_TRUE(try_read_write_buf(ptr_new2)); + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +/* Assert that mprotect() operations have no bearing on guard markers. */ +TEST_F(guard_pages, mprotect) +{ + const unsigned long page_size = self->page_size; + char *ptr; + int i; + + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Guard the middle of the range. */ + ASSERT_EQ(madvise(&ptr[5 * page_size], 2 * page_size, + MADV_GUARD_INSTALL), 0); + + /* Assert that it is indeed guarded. */ + ASSERT_FALSE(try_read_write_buf(&ptr[5 * page_size])); + ASSERT_FALSE(try_read_write_buf(&ptr[6 * page_size])); + + /* Now make these pages read-only. */ + ASSERT_EQ(mprotect(&ptr[5 * page_size], 2 * page_size, PROT_READ), 0); + + /* Make sure the range is still guarded. */ + ASSERT_FALSE(try_read_buf(&ptr[5 * page_size])); + ASSERT_FALSE(try_read_buf(&ptr[6 * page_size])); + + /* Make sure we can guard again without issue.*/ + ASSERT_EQ(madvise(&ptr[5 * page_size], 2 * page_size, + MADV_GUARD_INSTALL), 0); + + /* Make sure the range is, yet again, still guarded. */ + ASSERT_FALSE(try_read_buf(&ptr[5 * page_size])); + ASSERT_FALSE(try_read_buf(&ptr[6 * page_size])); + + /* Now unguard the whole range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0); + + /* Make sure the whole range is readable. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_TRUE(try_read_buf(curr)); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +/* Split and merge VMAs and make sure guard pages still behave. 
*/ +TEST_F(guard_pages, split_merge) +{ + const unsigned long page_size = self->page_size; + char *ptr, *ptr_new; + int i; + + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Guard the whole range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0); + + /* Make sure the whole range is guarded. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Now unmap some pages in the range so we split. */ + ASSERT_EQ(munmap(&ptr[2 * page_size], page_size), 0); + ASSERT_EQ(munmap(&ptr[5 * page_size], page_size), 0); + ASSERT_EQ(munmap(&ptr[8 * page_size], page_size), 0); + + /* Make sure the remaining ranges are guarded post-split. */ + for (i = 0; i < 2; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + for (i = 2; i < 5; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + for (i = 6; i < 8; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + for (i = 9; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Now map them again - the unmap will have cleared the guards. */ + ptr_new = mmap(&ptr[2 * page_size], page_size, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr_new, MAP_FAILED); + ptr_new = mmap(&ptr[5 * page_size], page_size, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr_new, MAP_FAILED); + ptr_new = mmap(&ptr[8 * page_size], page_size, PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr_new, MAP_FAILED); + + /* Now make sure guard pages are established. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + bool result = try_read_write_buf(curr); + bool expect_true = i == 2 || i == 5 || i == 8; + + ASSERT_TRUE(expect_true ? result : !result); + } + + /* Now guard everything again. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0); + + /* Make sure the whole range is guarded. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Now split the range into three. */ + ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0); + ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size, PROT_READ), 0); + + /* Make sure the whole range is guarded for read. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_buf(curr)); + } + + /* Now reset protection bits so we merge the whole thing. */ + ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ | PROT_WRITE), 0); + ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size, + PROT_READ | PROT_WRITE), 0); + + /* Make sure the whole range is still guarded. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Split range into 3 again... */ + ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ), 0); + ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size, PROT_READ), 0); + + /* ...and unguard the whole range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0); + + /* Make sure the whole range is remedied for read. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_TRUE(try_read_buf(curr)); + } + + /* Merge them again. 
*/ + ASSERT_EQ(mprotect(ptr, 3 * page_size, PROT_READ | PROT_WRITE), 0); + ASSERT_EQ(mprotect(&ptr[7 * page_size], 3 * page_size, + PROT_READ | PROT_WRITE), 0); + + /* Now ensure the merged range is remedied for read/write. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +/* Assert that MADV_DONTNEED does not remove guard markers. */ +TEST_F(guard_pages, dontneed) +{ + const unsigned long page_size = self->page_size; + char *ptr; + int i; + + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Back the whole range. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + *curr = 'y'; + } + + /* Guard every other page. */ + for (i = 0; i < 10; i += 2) { + char *curr = &ptr[i * page_size]; + int res = madvise(curr, page_size, MADV_GUARD_INSTALL); + + ASSERT_EQ(res, 0); + } + + /* Indicate that we don't need any of the range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_DONTNEED), 0); + + /* Check to ensure guard markers are still in place. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + bool result = try_read_buf(curr); + + if (i % 2 == 0) { + ASSERT_FALSE(result); + } else { + ASSERT_TRUE(result); + /* Make sure we really did get reset to zero page. */ + ASSERT_EQ(*curr, '\0'); + } + + /* Now write... */ + result = try_write_buf(&ptr[i * page_size]); + + /* ...and make sure same result. */ + ASSERT_TRUE(i % 2 != 0 ? result : !result); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +/* Assert that mlock()'ed pages work correctly with guard markers. */ +TEST_F(guard_pages, mlock) +{ + const unsigned long page_size = self->page_size; + char *ptr; + int i; + + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Populate. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + *curr = 'y'; + } + + /* Lock. */ + ASSERT_EQ(mlock(ptr, 10 * page_size), 0); + + /* Now try to guard, should fail with EINVAL. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), -1); + ASSERT_EQ(errno, EINVAL); + + /* OK unlock. */ + ASSERT_EQ(munlock(ptr, 10 * page_size), 0); + + /* Guard first half of range, should now succeed. */ + ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0); + + /* Make sure guard works. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + bool result = try_read_write_buf(curr); + + if (i < 5) { + ASSERT_FALSE(result); + } else { + ASSERT_TRUE(result); + ASSERT_EQ(*curr, 'x'); + } + } + + /* + * Now lock the latter part of the range. We can't lock the guard pages, + * as this would result in the pages being populated and the guarding + * would cause this to error out. + */ + ASSERT_EQ(mlock(&ptr[5 * page_size], 5 * page_size), 0); + + /* + * Now remove guard pages, we permit mlock()'d ranges to have guard + * pages removed as it is a non-destructive operation. + */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0); + + /* Now check that no guard pages remain. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +/* + * Assert that moving, extending and shrinking memory via mremap() retains + * guard markers where possible. 
+ * + * - Moving a mapping alone should retain markers as they are. + */ +TEST_F(guard_pages, mremap_move) +{ + const unsigned long page_size = self->page_size; + char *ptr, *ptr_new; + + /* Map 5 pages. */ + ptr = mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Place guard markers at both ends of the 5 page span. */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0); + ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0); + + /* Make sure the guard pages are in effect. */ + ASSERT_FALSE(try_read_write_buf(ptr)); + ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size])); + + /* Map a new region we will move this range into. Doing this ensures + * that we have reserved a range to map into. + */ + ptr_new = mmap(NULL, 5 * page_size, PROT_NONE, MAP_ANON | MAP_PRIVATE, + -1, 0); + ASSERT_NE(ptr_new, MAP_FAILED); + + ASSERT_EQ(mremap(ptr, 5 * page_size, 5 * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, ptr_new), ptr_new); + + /* Make sure the guard markers are retained. */ + ASSERT_FALSE(try_read_write_buf(ptr_new)); + ASSERT_FALSE(try_read_write_buf(&ptr_new[4 * page_size])); + + /* + * Clean up - we only need reference the new pointer as we overwrote the + * PROT_NONE range and moved the existing one. + */ + munmap(ptr_new, 5 * page_size); +} + +/* + * Assert that moving, extending and shrinking memory via mremap() retains + * guard markers where possible. + * + * Expanding should retain guard pages, only now in different position. The user + * will have to remove guard pages manually to fix up (they'd have to do the + * same if it were a PROT_NONE mapping). + */ +TEST_F(guard_pages, mremap_expand) +{ + const unsigned long page_size = self->page_size; + char *ptr, *ptr_new; + + /* Map 10 pages... */ + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + /* ...But unmap the last 5 so we can ensure we can expand into them. */ + ASSERT_EQ(munmap(&ptr[5 * page_size], 5 * page_size), 0); + + /* Place guard markers at both ends of the 5 page span. */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0); + ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0); + + /* Make sure the guarding is in effect. */ + ASSERT_FALSE(try_read_write_buf(ptr)); + ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size])); + + /* Now expand to 10 pages. */ + ptr = mremap(ptr, 5 * page_size, 10 * page_size, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* + * Make sure the guard markers are retained in their original positions. + */ + ASSERT_FALSE(try_read_write_buf(ptr)); + ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size])); + + /* Reserve a region which we can move to and expand into. */ + ptr_new = mmap(NULL, 20 * page_size, PROT_NONE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr_new, MAP_FAILED); + + /* Now move and expand into it. */ + ptr = mremap(ptr, 10 * page_size, 20 * page_size, + MREMAP_MAYMOVE | MREMAP_FIXED, ptr_new); + ASSERT_EQ(ptr, ptr_new); + + /* + * Again, make sure the guard markers are retained in their original positions. + */ + ASSERT_FALSE(try_read_write_buf(ptr)); + ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size])); + + /* + * A real user would have to remove guard markers, but would reasonably + * expect all characteristics of the mapping to be retained, including + * guard markers. + */ + + /* Cleanup. 
*/ + munmap(ptr, 20 * page_size); +} +/* + * Assert that moving, extending and shrinking memory via mremap() retains + * guard markers where possible. + * + * Shrinking will result in markers that are shrunk over being removed. Again, + * if the user were using a PROT_NONE mapping they'd have to manually fix this + * up also so this is OK. + */ +TEST_F(guard_pages, mremap_shrink) +{ + const unsigned long page_size = self->page_size; + char *ptr; + int i; + + /* Map 5 pages. */ + ptr = mmap(NULL, 5 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Place guard markers at both ends of the 5 page span. */ + ASSERT_EQ(madvise(ptr, page_size, MADV_GUARD_INSTALL), 0); + ASSERT_EQ(madvise(&ptr[4 * page_size], page_size, MADV_GUARD_INSTALL), 0); + + /* Make sure the guarding is in effect. */ + ASSERT_FALSE(try_read_write_buf(ptr)); + ASSERT_FALSE(try_read_write_buf(&ptr[4 * page_size])); + + /* Now shrink to 3 pages. */ + ptr = mremap(ptr, 5 * page_size, 3 * page_size, MREMAP_MAYMOVE); + ASSERT_NE(ptr, MAP_FAILED); + + /* We expect the guard marker at the start to be retained... */ + ASSERT_FALSE(try_read_write_buf(ptr)); + + /* ...But remaining pages will not have guard markers. */ + for (i = 1; i < 3; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + /* + * As with expansion, a real user would have to remove guard pages and + * fixup. But you'd have to do similar manual things with PROT_NONE + * mappings too. + */ + + /* + * If we expand back to the original size, the end marker will, of + * course, no longer be present. + */ + ptr = mremap(ptr, 3 * page_size, 5 * page_size, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Again, we expect the guard marker at the start to be retained... */ + ASSERT_FALSE(try_read_write_buf(ptr)); + + /* ...But remaining pages will not have guard markers. */ + for (i = 1; i < 5; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + /* Cleanup. */ + munmap(ptr, 5 * page_size); +} + +/* + * Assert that forking a process with VMAs that do not have VM_WIPEONFORK set + * retain guard pages. + */ +TEST_F(guard_pages, fork) +{ + const unsigned long page_size = self->page_size; + char *ptr; + pid_t pid; + int i; + + /* Map 10 pages. */ + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Establish guard apges in the first 5 pages. */ + ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0); + + pid = fork(); + ASSERT_NE(pid, -1); + if (!pid) { + /* This is the child process now. */ + + /* Assert that the guarding is in effect. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + bool result = try_read_write_buf(curr); + + ASSERT_TRUE(i >= 5 ? result : !result); + } + + /* Now unguard the range.*/ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_REMOVE), 0); + + exit(0); + } + + /* Parent process. */ + + /* Parent simply waits on child. */ + waitpid(pid, NULL, 0); + + /* Child unguard does not impact parent page table state. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + bool result = try_read_write_buf(curr); + + ASSERT_TRUE(i >= 5 ? result : !result); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +/* + * Assert that forking a process with VMAs that do have VM_WIPEONFORK set + * behave as expected. 
+ */ +TEST_F(guard_pages, fork_wipeonfork) +{ + const unsigned long page_size = self->page_size; + char *ptr; + pid_t pid; + int i; + + /* Map 10 pages. */ + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Mark wipe on fork. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_WIPEONFORK), 0); + + /* Guard the first 5 pages. */ + ASSERT_EQ(madvise(ptr, 5 * page_size, MADV_GUARD_INSTALL), 0); + + pid = fork(); + ASSERT_NE(pid, -1); + if (!pid) { + /* This is the child process now. */ + + /* Guard will have been wiped. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_TRUE(try_read_write_buf(curr)); + } + + exit(0); + } + + /* Parent process. */ + + waitpid(pid, NULL, 0); + + /* Guard markers should be in effect.*/ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + bool result = try_read_write_buf(curr); + + ASSERT_TRUE(i >= 5 ? result : !result); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +/* Ensure that MADV_FREE retains guard entries as expected. */ +TEST_F(guard_pages, lazyfree) +{ + const unsigned long page_size = self->page_size; + char *ptr; + int i; + + /* Map 10 pages. */ + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Guard range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0); + + /* Ensure guarded. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Lazyfree range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_FREE), 0); + + /* This should leave the guard markers in place. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +/* Ensure that MADV_POPULATE_READ, MADV_POPULATE_WRITE behave as expected. */ +TEST_F(guard_pages, populate) +{ + const unsigned long page_size = self->page_size; + char *ptr; + + /* Map 10 pages. */ + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Guard range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0); + + /* Populate read should error out... */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_POPULATE_READ), -1); + ASSERT_EQ(errno, EFAULT); + + /* ...as should populate write. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_POPULATE_WRITE), -1); + ASSERT_EQ(errno, EFAULT); + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +/* Ensure that MADV_COLD, MADV_PAGEOUT do not remove guard markers. */ +TEST_F(guard_pages, cold_pageout) +{ + const unsigned long page_size = self->page_size; + char *ptr; + int i; + + /* Map 10 pages. */ + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Guard range. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0); + + /* Ensured guarded. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Now mark cold. This should have no impact on guard markers. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_COLD), 0); + + /* Should remain guarded. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* OK, now page out. 
This should equally, have no effect on markers. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_PAGEOUT), 0); + + /* Should remain guarded. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Cleanup. */ + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +/* Ensure that guard pages do not break userfaultd. */ +TEST_F(guard_pages, uffd) +{ + const unsigned long page_size = self->page_size; + int uffd; + char *ptr; + int i; + struct uffdio_api api = { + .api = UFFD_API, + .features = 0, + }; + struct uffdio_register reg; + struct uffdio_range range; + + /* Set up uffd. */ + uffd = userfaultfd(0); + if (uffd == -1 && errno == EPERM) + ksft_exit_skip("No userfaultfd permissions, try running as root.\n"); + ASSERT_NE(uffd, -1); + + ASSERT_EQ(ioctl(uffd, UFFDIO_API, &api), 0); + + /* Map 10 pages. */ + ptr = mmap(NULL, 10 * page_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + ASSERT_NE(ptr, MAP_FAILED); + + /* Register the range with uffd. */ + range.start = (unsigned long)ptr; + range.len = 10 * page_size; + reg.range = range; + reg.mode = UFFDIO_REGISTER_MODE_MISSING; + ASSERT_EQ(ioctl(uffd, UFFDIO_REGISTER, ®), 0); + + /* Guard the range. This should not trigger the uffd. */ + ASSERT_EQ(madvise(ptr, 10 * page_size, MADV_GUARD_INSTALL), 0); + + /* The guarding should behave as usual with no uffd intervention. */ + for (i = 0; i < 10; i++) { + char *curr = &ptr[i * page_size]; + + ASSERT_FALSE(try_read_write_buf(curr)); + } + + /* Cleanup. */ + ASSERT_EQ(ioctl(uffd, UFFDIO_UNREGISTER, &range), 0); + close(uffd); + ASSERT_EQ(munmap(ptr, 10 * page_size), 0); +} + +TEST_HARNESS_MAIN From ab6e8e74e47362bd9d79dd4394a167b2afe0cc77 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 27 Oct 2024 13:14:42 -0700 Subject: [PATCH 176/215] mm: delete the unused put_pages_list() The last user of put_pages_list() converted away from it in 6.10 commit 06c375053cef ("iommu/vt-d: add wrapper functions for page allocations"): delete put_pages_list(). Link: https://lkml.kernel.org/r/d9985d6a-293e-176b-e63d-82fdfd28c139@google.com Signed-off-by: Hugh Dickins Acked-by: Peter Xu Acked-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Matthew Wilcox (Oracle) Cc: Pasha Tatashin Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 -- mm/swap.c | 31 ------------------------------- 2 files changed, 33 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 78848fbefe94e..32888a97ab44e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1286,8 +1286,6 @@ static inline struct folio *virt_to_folio(const void *x) void __folio_put(struct folio *folio); -void put_pages_list(struct list_head *pages); - void split_page(struct page *page, unsigned int order); void folio_copy(struct folio *dst, struct folio *src); int folio_mc_copy(struct folio *dst, struct folio *src); diff --git a/mm/swap.c b/mm/swap.c index b8e3259ea2c47..638a3f001676f 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -127,37 +127,6 @@ void __folio_put(struct folio *folio) } EXPORT_SYMBOL(__folio_put); -/** - * put_pages_list() - release a list of pages - * @pages: list of pages threaded on page->lru - * - * Release a list of pages which are strung together on page.lru. 
- */ -void put_pages_list(struct list_head *pages) -{ - struct folio_batch fbatch; - struct folio *folio, *next; - - folio_batch_init(&fbatch); - list_for_each_entry_safe(folio, next, pages, lru) { - if (!folio_put_testzero(folio)) - continue; - if (folio_test_hugetlb(folio)) { - free_huge_folio(folio); - continue; - } - /* LRU flag must be clear because it's passed using the lru */ - if (folio_batch_add(&fbatch, folio) > 0) - continue; - free_unref_folios(&fbatch); - } - - if (fbatch.nr) - free_unref_folios(&fbatch); - INIT_LIST_HEAD(pages); -} -EXPORT_SYMBOL(put_pages_list); - typedef void (*move_fn_t)(struct lruvec *lruvec, struct folio *folio); static void lru_add(struct lruvec *lruvec, struct folio *folio) From e1479b880cb213057c48dc8b5fb1a8a64e04f0eb Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Mon, 28 Oct 2024 19:11:05 -0700 Subject: [PATCH 177/215] memcg: rename do_flush_stats and add force flag Patch series "memcg: tracepoint for flushing stats", v3. This series adds new capability for understanding frequency and circumstances behind flushing memcg stats. This patch (of 2): Change the name to something more consistent with others in the file and use double unders to signify it is associated with the mem_cgroup_flush_stats() API call. Additionally include a new flag that call sites use to indicate a forced flush; skipping checks and flushing unconditionally. There are no changes in functionality. Link: https://lkml.kernel.org/r/20241029021106.25587-1-inwardvessel@gmail.com Link: https://lkml.kernel.org/r/20241029021106.25587-2-inwardvessel@gmail.com Signed-off-by: JP Kobryn Reviewed-by: Yosry Ahmed Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3402b6d13bc2f..a4da834b4aa32 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -595,8 +595,11 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) } } -static void do_flush_stats(struct mem_cgroup *memcg) +static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) { + if (!force && !memcg_vmstats_needs_flush(memcg->vmstats)) + return; + if (mem_cgroup_is_root(memcg)) WRITE_ONCE(flush_last_time, jiffies_64); @@ -620,8 +623,7 @@ void mem_cgroup_flush_stats(struct mem_cgroup *memcg) if (!memcg) memcg = root_mem_cgroup; - if (memcg_vmstats_needs_flush(memcg->vmstats)) - do_flush_stats(memcg); + __mem_cgroup_flush_stats(memcg, false); } void mem_cgroup_flush_stats_ratelimited(struct mem_cgroup *memcg) @@ -637,7 +639,7 @@ static void flush_memcg_stats_dwork(struct work_struct *w) * Deliberately ignore memcg_vmstats_needs_flush() here so that flushing * in latency-sensitive paths is as cheap as possible. */ - do_flush_stats(root_mem_cgroup); + __mem_cgroup_flush_stats(root_mem_cgroup, true); queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } @@ -5286,11 +5288,8 @@ bool obj_cgroup_may_zswap(struct obj_cgroup *objcg) break; } - /* - * mem_cgroup_flush_stats() ignores small changes. Use - * do_flush_stats() directly to get accurate stats for charging. 
- */ - do_flush_stats(memcg); + /* Force flush to get accurate stats for charging */ + __mem_cgroup_flush_stats(memcg, true); pages = memcg_page_state(memcg, MEMCG_ZSWAP_B) / PAGE_SIZE; if (pages < max) continue; From f914ac96ee8828368f5a24553e75216d76da0b42 Mon Sep 17 00:00:00 2001 From: JP Kobryn Date: Mon, 28 Oct 2024 19:11:06 -0700 Subject: [PATCH 178/215] memcg: add flush tracepoint This tracepoint gives visibility on how often the flushing of memcg stats occurs and contains info on whether it was forced, skipped, and the value of stats updated. It can help with understanding how readers are affected by having to perform the flush, and the effectiveness of the flush by inspecting the number of stats updated. Paired with the recently added tracepoints for tracing rstat updates, it can also help show correlation where stats exceed thresholds frequently. Link: https://lkml.kernel.org/r/20241029021106.25587-3-inwardvessel@gmail.com Signed-off-by: JP Kobryn Reviewed-by: Yosry Ahmed Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Steven Rostedt (Google) Signed-off-by: Andrew Morton --- include/trace/events/memcg.h | 25 +++++++++++++++++++++++++ mm/memcontrol.c | 7 ++++++- 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/include/trace/events/memcg.h b/include/trace/events/memcg.h index 8667e57816d21..dfe2f51019b4c 100644 --- a/include/trace/events/memcg.h +++ b/include/trace/events/memcg.h @@ -74,6 +74,31 @@ DEFINE_EVENT(memcg_rstat_events, count_memcg_events, TP_ARGS(memcg, item, val) ); +TRACE_EVENT(memcg_flush_stats, + + TP_PROTO(struct mem_cgroup *memcg, s64 stats_updates, + bool force, bool needs_flush), + + TP_ARGS(memcg, stats_updates, force, needs_flush), + + TP_STRUCT__entry( + __field(u64, id) + __field(s64, stats_updates) + __field(bool, force) + __field(bool, needs_flush) + ), + + TP_fast_assign( + __entry->id = cgroup_id(memcg->css.cgroup); + __entry->stats_updates = stats_updates; + __entry->force = force; + __entry->needs_flush = needs_flush; + ), + + TP_printk("memcg_id=%llu stats_updates=%lld force=%d needs_flush=%d", + __entry->id, __entry->stats_updates, + __entry->force, __entry->needs_flush) +); #endif /* _TRACE_MEMCG_H */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a4da834b4aa32..6486e96528434 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -597,7 +597,12 @@ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val) static void __mem_cgroup_flush_stats(struct mem_cgroup *memcg, bool force) { - if (!force && !memcg_vmstats_needs_flush(memcg->vmstats)) + bool needs_flush = memcg_vmstats_needs_flush(memcg->vmstats); + + trace_memcg_flush_stats(memcg, atomic64_read(&memcg->vmstats->stats_updates), + force, needs_flush); + + if (!force && !needs_flush) return; if (mem_cgroup_is_root(memcg)) From 408a8dc6232294ac83f233f869f425725765d2e1 Mon Sep 17 00:00:00 2001 From: zhangguopeng Date: Tue, 29 Oct 2024 18:18:53 +0800 Subject: [PATCH 179/215] mm/memory-failure: replace sprintf() with sysfs_emit() As Documentation/filesystems/sysfs.rst suggested, show() should only use sysfs_emit() or sysfs_emit_at() when formatting the value to be returned to user space. 
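As an illustration of that convention (this snippet is not part of the patch; the struct, attribute and function names are invented), a read-only device attribute following the sysfs.rst guidance looks roughly like:

#include <linux/device.h>
#include <linux/sysfs.h>

struct example_stats {
	unsigned long count;
};

static ssize_t example_count_show(struct device *dev,
				  struct device_attribute *attr, char *buf)
{
	struct example_stats *stats = dev_get_drvdata(dev);

	/* Output is bounded to PAGE_SIZE; returns the number of bytes written. */
	return sysfs_emit(buf, "%lu\n", stats->count);
}
static DEVICE_ATTR_RO(example_count);

Unlike a bare sprintf(), sysfs_emit() knows it is writing into the page-sized buffer sysfs hands to show() and will not write past it.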
Link: https://lkml.kernel.org/r/20241029101853.37890-1-zhangguopeng@kylinos.cn Signed-off-by: zhangguopeng Acked-by: Miaohe Lin Cc: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 7ce7ba8586f5a..a7b8ccd29b6f5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -100,7 +100,7 @@ static ssize_t _name##_show(struct device *dev, \ { \ struct memory_failure_stats *mf_stats = \ &NODE_DATA(dev->id)->mf_stats; \ - return sprintf(buf, "%lu\n", mf_stats->_name); \ + return sysfs_emit(buf, "%lu\n", mf_stats->_name); \ } \ static DEVICE_ATTR_RO(_name) From f85219096648b251a81e9fe24a1974590cfc417d Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 29 Oct 2024 00:36:14 +0900 Subject: [PATCH 180/215] zram: clear IDLE flag after recompression Patch series "zram: IDLE flag handling fixes", v2. zram can wrongly preserve ZRAM_IDLE flag on its entries which can result in premature post-processing (writeback and recompression) of such entries. This patch (of 2) Recompression should clear ZRAM_IDLE flag on the entries it has accessed, because otherwise some entries, specifically those for which recompression has failed, become immediate candidate entries for another post-processing (e.g. writeback). Consider the following case: - recompression marks entries IDLE every 4 hours and attempts to recompress them - some entries are incompressible, so we keep them intact and hence preserve IDLE flag - writeback marks entries IDLE every 8 hours and writebacks IDLE entries, however we have IDLE entries left from recompression, so writeback prematurely writebacks those entries. The bug was reported by Shin Kawamura. Link: https://lkml.kernel.org/r/20241028153629.1479791-1-senozhatsky@chromium.org Link: https://lkml.kernel.org/r/20241028153629.1479791-2-senozhatsky@chromium.org Fixes: 84b33bf78889 ("zram: introduce recompress sysfs knob") Signed-off-by: Sergey Senozhatsky Reported-by: Shin Kawamura Acked-by: Brian Geffon Cc: Minchan Kim Signed-off-by: Andrew Morton Cc: --- drivers/block/zram/zram_drv.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e6d12e81241d8..a16dbffcdca30 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1864,6 +1864,13 @@ static int recompress_slot(struct zram *zram, u32 index, struct page *page, if (ret) return ret; + /* + * We touched this entry so mark it as non-IDLE. This makes sure that + * we don't preserve IDLE flag and don't incorrectly pick this entry + * for different post-processing type (e.g. writeback). + */ + zram_clear_flag(zram, index, ZRAM_IDLE); + class_index_old = zs_lookup_class_index(zram->mem_pool, comp_len_old); /* * Iterate the secondary comp algorithms list (in order of priority) From d37da422edb0664a2037e6d7d42fe6d339aae78a Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 29 Oct 2024 00:36:15 +0900 Subject: [PATCH 181/215] zram: clear IDLE flag in mark_idle() If entry does not fulfill current mark_idle() parameters, e.g. cutoff time, then we should clear its ZRAM_IDLE from previous mark_idle() invocations. Consider the following case: - mark_idle() cutoff time 8h - mark_idle() cutoff time 4h - writeback() idle - will writeback entries with cutoff time 8h, while it should only pick entries with cutoff time 4h The bug was reported by Shin Kawamura. 
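To make the sequence concrete, both post-processing passes are driven from userspace through the zram sysfs knobs. A rough sketch of such a driver follows; it assumes an already configured /sys/block/zram0 with a writeback backing device and a secondary compression algorithm set up, and the helper is purely illustrative, not an existing tool:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Illustrative helper: write a value to a zram0 sysfs attribute. */
static void zram_knob(const char *attr, const char *val)
{
	char path[64];
	int fd;

	snprintf(path, sizeof(path), "/sys/block/zram0/%s", attr);
	fd = open(path, O_WRONLY);
	if (fd < 0 || write(fd, val, strlen(val)) < 0)
		perror(path);
	if (fd >= 0)
		close(fd);
}

int main(void)
{
	/* Recompression pass (say, every 4 hours): mark idle, recompress. */
	zram_knob("idle", "all");
	zram_knob("recompress", "type=idle");

	/*
	 * Writeback pass (say, every 8 hours). Without this fix, slots that
	 * the recompression pass touched but failed to recompress still
	 * carry ZRAM_IDLE and are written back here prematurely.
	 */
	zram_knob("writeback", "idle");
	return 0;
}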
Link: https://lkml.kernel.org/r/20241028153629.1479791-3-senozhatsky@chromium.org Fixes: 755804d16965 ("zram: introduce an aged idle interface") Signed-off-by: Sergey Senozhatsky Reported-by: Shin Kawamura Acked-by: Brian Geffon Cc: Minchan Kim Signed-off-by: Andrew Morton Cc: --- drivers/block/zram/zram_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index a16dbffcdca30..cee49bb0126d9 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -410,6 +410,8 @@ static void mark_idle(struct zram *zram, ktime_t cutoff) #endif if (is_idle) zram_set_flag(zram, index, ZRAM_IDLE); + else + zram_clear_flag(zram, index, ZRAM_IDLE); zram_slot_unlock(zram, index); } } From e847f8cd96ae808516c1615697b464e6f68c02a4 Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Tue, 8 Oct 2024 17:41:40 +0800 Subject: [PATCH 182/215] selftest/mm: fix typo in virtual_address_range The function name should be *hint* address, so correct it. Link: https://lkml.kernel.org/r/20241008094141.549248-4-zhangchunyan@iscas.ac.cn Signed-off-by: Chunyan Zhang Reviewed-by: Charlie Jenkins Acked-by: Palmer Dabbelt Cc: Alexandre Ghiti Cc: Paul Walmsley Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/virtual_address_range.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/mm/virtual_address_range.c b/tools/testing/selftests/mm/virtual_address_range.c index 4e4c1e311247f..2a2b69e91950a 100644 --- a/tools/testing/selftests/mm/virtual_address_range.c +++ b/tools/testing/selftests/mm/virtual_address_range.c @@ -64,7 +64,7 @@ #define NR_CHUNKS_HIGH NR_CHUNKS_384TB #endif -static char *hind_addr(void) +static char *hint_addr(void) { int bits = HIGH_ADDR_SHIFT + rand() % (63 - HIGH_ADDR_SHIFT); @@ -185,7 +185,7 @@ int main(int argc, char *argv[]) } for (i = 0; i < NR_CHUNKS_HIGH; i++) { - hint = hind_addr(); + hint = hint_addr(); hptr[i] = mmap(hint, MAP_CHUNK_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); From 4175eff0e007b3b781f45742551393736346755d Mon Sep 17 00:00:00 2001 From: Chunyan Zhang Date: Tue, 8 Oct 2024 17:41:41 +0800 Subject: [PATCH 183/215] selftests/mm: skip virtual_address_range tests on riscv RISC-V doesn't currently have the behavior of restricting the virtual address space which virtual_address_range tests check, this will cause the tests fail. So lets disable the whole test suite for riscv64 for now, not build it and run_vmtests.sh will skip it if it is not present. 
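For readers unfamiliar with the behaviour the test depends on: on architectures that do restrict the address space (x86_64 and arm64, for instance), mmap() keeps allocations below the default user VA window unless the caller passes a hint above it. A minimal userspace sketch of that contract, illustrative only and not part of the selftest, is:

#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 4UL << 20;			/* arbitrary 4MiB mapping */
	void *hint = (void *)(1UL << 50);	/* well above the default window */

	void *low = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	void *high = mmap(hint, len, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	/*
	 * On architectures with the restriction, "low" stays below the
	 * default window and only "high" may land above it; riscv does not
	 * implement this, which is why the test is skipped there.
	 */
	printf("no hint: %p, high hint: %p\n", low, high);
	return 0;
}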
Link: https://lkml.kernel.org/r/20241008094141.549248-5-zhangchunyan@iscas.ac.cn Signed-off-by: Chunyan Zhang Reviewed-by: Charlie Jenkins Acked-by: Palmer Dabbelt Cc: Alexandre Ghiti Cc: Paul Walmsley Cc: Shuah Khan Signed-off-by: Andrew Morton --- tools/testing/selftests/mm/Makefile | 2 ++ tools/testing/selftests/mm/run_vmtests.sh | 10 ++++++---- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/testing/selftests/mm/Makefile b/tools/testing/selftests/mm/Makefile index 15c734d6cfec5..00c6fc6946338 100644 --- a/tools/testing/selftests/mm/Makefile +++ b/tools/testing/selftests/mm/Makefile @@ -116,7 +116,9 @@ endif ifneq (,$(filter $(ARCH),arm64 mips64 parisc64 powerpc riscv64 s390x sparc64 x86_64 s390)) TEST_GEN_FILES += va_high_addr_switch +ifneq ($(ARCH),riscv64) TEST_GEN_FILES += virtual_address_range +endif TEST_GEN_FILES += write_to_hugetlbfs endif diff --git a/tools/testing/selftests/mm/run_vmtests.sh b/tools/testing/selftests/mm/run_vmtests.sh index c5797ad1d37b6..4493bfd1911c9 100755 --- a/tools/testing/selftests/mm/run_vmtests.sh +++ b/tools/testing/selftests/mm/run_vmtests.sh @@ -347,10 +347,12 @@ if [ $VADDR64 -ne 0 ]; then # allows high virtual address allocation requests independent # of platform's physical memory. - prev_policy=$(cat /proc/sys/vm/overcommit_memory) - echo 1 > /proc/sys/vm/overcommit_memory - CATEGORY="hugevm" run_test ./virtual_address_range - echo $prev_policy > /proc/sys/vm/overcommit_memory + if [ -x ./virtual_address_range ]; then + prev_policy=$(cat /proc/sys/vm/overcommit_memory) + echo 1 > /proc/sys/vm/overcommit_memory + CATEGORY="hugevm" run_test ./virtual_address_range + echo $prev_policy > /proc/sys/vm/overcommit_memory + fi # va high address boundary switch test ARCH_ARM64="arm64" From 8e1817b6ba97c3d92d163447226cf6a0c1f90723 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Thu, 31 Oct 2024 15:36:08 -0400 Subject: [PATCH 184/215] vma: detect infinite loop in vma tree There have been no reported infinite loops in the tree, but checking the detection of an infinite loop during validation is simple enough. Add the detection to the validate_mm() function so that error reports are clear and don't just report stalls. This does not protect against internal maple tree issues, but it does detect too many vmas being returned from the tree. The variance of +10 is to allow for the debugging output to be more useful for nearly correct counts. In the event of more than 10 over the map_count, the count will be set to -1 for easier identification of a potential infinite loop. Note that the mmap lock is held to ensure a consistent tree state during the validation process. [akpm@linux-foundation.org: add comment] Link: https://lkml.kernel.org/r/20241031193608.1965366-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. 
Howlett Reviewed-by: David Hildenbrand Reviewed-by: Vlastimil Babka Reviewed-by: Lorenzo Stoakes Cc: Jann Horn Signed-off-by: Andrew Morton --- mm/vma.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/vma.c b/mm/vma.c index 68138e8c153e6..8a454a7bbc80b 100644 --- a/mm/vma.c +++ b/mm/vma.c @@ -615,7 +615,11 @@ void validate_mm(struct mm_struct *mm) anon_vma_unlock_read(anon_vma); } #endif - i++; + /* Check for a infinite loop */ + if (++i > mm->map_count + 10) { + i = -1; + break; + } } if (i != mm->map_count) { pr_emerg("map_count %d vma iterator %d\n", mm->map_count, i); From 04dafdd2082c601f267d68bd48b15b8189d63c29 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 Oct 2024 23:16:23 +0000 Subject: [PATCH 185/215] maple_tree: print empty for an empty tree on mt_dump() Patch series "refine storing null", v5. When overwriting the whole range with NULL, current behavior is not correct. An empty tree is represented by having the tree point to NULL directly. An empty tree indicates the entire range (0-ULONG_MAX) is NULL. A store operation into an existing node that causes 0 - ULONG_MAX to be equal to NULL may not be restored to an empty state - a node is used to store the single range instead. This is wasteful and different from the initial setup of the tree. Once the tree is using a single node to store 0 - ULONG_MAX, problems may arise when storing more values into a tree with the unexpected state of 0 - ULONG being a single range in a node. User visible issues may mean a corrupt tree and incorrect storage of information within the tree. This would be limited to users who create and then empty a tree by overwriting all values, then try to store more NULLs into the empty tree. I cannot come up with an example of any user doing this (users usually destroy the tree and generally don't keep trying to store NULLs over NULLs), but patch 4/5 "maple_tree: refine mas_store_root() on storing NULL" should be backported just in case. This patch (of 5): Currently for an empty tree, it would print: maple_tree(0x7ffcd02c6ee0) flags 1, height 0 root (nil) 0: (nil) This is a little misleading. Let's print (empty) for an empty tree. Link: https://lkml.kernel.org/r/20241031231627.14316-1-richard.weiyang@gmail.com Link: https://lkml.kernel.org/r/20241031231627.14316-2-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 38aa8abf8eb81..523355fb2bbe2 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -7273,10 +7273,12 @@ void mt_dump(const struct maple_tree *mt, enum mt_dump_format format) pr_info("maple_tree(" PTR_FMT ") flags %X, height %u root " PTR_FMT "\n", mt, mt->ma_flags, mt_height(mt), entry); - if (!xa_is_node(entry)) - mt_dump_entry(entry, 0, 0, 0, format); - else if (entry) + if (xa_is_node(entry)) mt_dump_node(mt, entry, 0, mt_node_max(entry), 0, format); + else if (entry) + mt_dump_entry(entry, 0, 0, 0, format); + else + pr_info("(empty)\n"); } EXPORT_SYMBOL_GPL(mt_dump); From cefbcf206f6d92dc0076e3fda06e2b9331b77868 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 Oct 2024 23:16:24 +0000 Subject: [PATCH 186/215] maple_tree: the return value of mas_root_expand() is not used No user of the return value now, just remove it. 
Link: https://lkml.kernel.org/r/20241031231627.14316-3-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 523355fb2bbe2..071d3055f1fa5 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3408,7 +3408,7 @@ static noinline_for_kasan void mas_commit_b_node(struct ma_wr_state *wr_mas, * @mas: The maple state * @entry: The entry to store into the tree */ -static inline int mas_root_expand(struct ma_state *mas, void *entry) +static inline void mas_root_expand(struct ma_state *mas, void *entry) { void *contents = mas_root_locked(mas); enum maple_type type = maple_leaf_64; @@ -3444,7 +3444,7 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry) ma_set_meta(node, maple_leaf_64, 0, slot); /* swap the new root into the tree */ rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node)); - return slot; + return; } static inline void mas_store_root(struct ma_state *mas, void *entry) From 8c836f1712d750163fb00b6cc3a730149c215979 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 Oct 2024 23:16:25 +0000 Subject: [PATCH 187/215] maple_tree: not necessary to check index/last again Before calling mas_new_root(), the range has been checked. Link: https://lkml.kernel.org/r/20241031231627.14316-4-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/maple_tree.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 071d3055f1fa5..4900f182e99d7 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3670,7 +3670,9 @@ static inline void mas_new_root(struct ma_state *mas, void *entry) void __rcu **slots; unsigned long *pivots; - if (!entry && !mas->index && mas->last == ULONG_MAX) { + WARN_ON_ONCE(mas->index || mas->last != ULONG_MAX); + + if (!entry) { mas->depth = 0; mas_set_height(mas); rcu_assign_pointer(mas->tree->ma_root, entry); From 0ea120b278ad7f7cfeeb606e150ad04b192df60b Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 Oct 2024 23:16:26 +0000 Subject: [PATCH 188/215] maple_tree: refine mas_store_root() on storing NULL Currently, when storing NULL on mas_store_root(), the behavior could be improved. Storing NULLs over the entire tree may result in a node being used to store a single range. Further stores of NULL may cause the node and tree to be corrupt and cause incorrect behaviour. Fixing the store to the root null fixes the issue by ensuring that a range of 0 - ULONG_MAX results in an empty tree. Users of the tree may experience incorrect values returned if the tree was expanded to store values, then overwritten by all NULLS, then continued to store NULLs over the empty area. 
For example, possible cases are: * storing NULL at any range results in a new node * storing NULL at range [m, n] where m > 0 to a single entry tree results in a new node with range [m, n] set to NULL * storing NULL at range [m, n] where m > 0 to an empty tree results in consecutive NULL slots * it allows multiple NULL entries by expanding the root to store NULLs into an empty tree This patch improves on that in two ways: * memory efficiency, by resetting to an empty tree instead of using a node * removing the possibility of consecutive NULL slots, which would prohibit extended NULL handling in later operations Link: https://lkml.kernel.org/r/20241031231627.14316-5-richard.weiyang@gmail.com Fixes: 54a611b60590 ("Maple Tree: add new data structure") Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Liam R. Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Cc: Signed-off-by: Andrew Morton --- lib/maple_tree.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/lib/maple_tree.c b/lib/maple_tree.c index 4900f182e99d7..d0ae808f3a149 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -3447,9 +3447,20 @@ static inline void mas_root_expand(struct ma_state *mas, void *entry) return; } +/* + * mas_store_root() - Storing value into root. + * @mas: The maple state + * @entry: The entry to store. + * + * There is no root node now and we are storing a value into the root - this + * function either assigns the pointer or expands into a node. + */ static inline void mas_store_root(struct ma_state *mas, void *entry) { - if (likely((mas->last != 0) || (mas->index != 0))) + if (!entry) { + if (!mas->index) + rcu_assign_pointer(mas->tree->ma_root, NULL); + } else if (likely((mas->last != 0) || (mas->index != 0))) mas_root_expand(mas, entry); else if (((unsigned long) (entry) & 3) == 2) mas_root_expand(mas, entry); From 431e10601913f8f2006f3ed607e73eedf264b426 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 31 Oct 2024 23:16:27 +0000 Subject: [PATCH 189/215] maple_tree: add a test checking storing null Add a test to assert that, when storing NULL to an empty tree or a single entry tree, it will not result in: * a root node with range [0, ULONG_MAX] set to NULL * a root node with consecutive slot set to NULL [akpm@linux-foundation.org: work around build error (mas_root)] Link: https://lkml.kernel.org/r/20241031231627.14316-6-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Liam R. Howlett Cc: Liam R. 
Howlett Cc: Sidhartha Kumar Cc: Lorenzo Stoakes Signed-off-by: Andrew Morton --- lib/test_maple_tree.c | 90 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 90 insertions(+) diff --git a/lib/test_maple_tree.c b/lib/test_maple_tree.c index 31561e0e1a0d4..704cb1093ae8f 100644 --- a/lib/test_maple_tree.c +++ b/lib/test_maple_tree.c @@ -1387,6 +1387,92 @@ static noinline void __init check_prev_entry(struct maple_tree *mt) mas_unlock(&mas); } +static noinline void __init check_store_null(struct maple_tree *mt) +{ + MA_STATE(mas, mt, 0, ULONG_MAX); + + /* + * Store NULL at range [0, ULONG_MAX] to an empty tree should result + * in an empty tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, !mtree_empty(mt)); + mas_unlock(&mas); + mtree_destroy(mt); + + /* + * Store NULL at any range to an empty tree should result in an empty + * tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_set_range(&mas, 3, 10); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, !mtree_empty(mt)); + mas_unlock(&mas); + mtree_destroy(mt); + + /* + * Store NULL at range [0, ULONG_MAX] to a single entry tree should + * result in an empty tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_set(&mas, 0); + mas_store_gfp(&mas, &mas, GFP_KERNEL); + mas_set_range(&mas, 0, ULONG_MAX); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, !mtree_empty(mt)); + mas_unlock(&mas); + mtree_destroy(mt); + + /* + * Store NULL at range [0, n] to a single entry tree should + * result in an empty tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_set(&mas, 0); + mas_store_gfp(&mas, &mas, GFP_KERNEL); + mas_set_range(&mas, 0, 5); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, !mtree_empty(mt)); + mas_unlock(&mas); + mtree_destroy(mt); + + /* + * Store NULL at range [m, n] where m > 0 to a single entry tree + * should still be a single entry tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_set(&mas, 0); + mas_store_gfp(&mas, &mas, GFP_KERNEL); + mas_set_range(&mas, 2, 5); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, mtree_empty(mt)); +// MT_BUG_ON(mt, xa_is_node(mas_root(&mas))); + mas_unlock(&mas); + mtree_destroy(mt); + + /* + * Store NULL at range [0, ULONG_MAX] to a tree with node should + * result in an empty tree + */ + mt_init_flags(mt, MT_FLAGS_ALLOC_RANGE); + mas_lock(&mas); + mas_set_range(&mas, 1, 3); + mas_store_gfp(&mas, &mas, GFP_KERNEL); +// MT_BUG_ON(mt, !xa_is_node(mas_root(&mas))); + mas_set_range(&mas, 0, ULONG_MAX); + mas_store_gfp(&mas, NULL, GFP_KERNEL); + MT_BUG_ON(mt, !mtree_empty(mt)); + mas_unlock(&mas); + mtree_destroy(mt); +} + static noinline void __init check_root_expand(struct maple_tree *mt) { MA_STATE(mas, mt, 0, 0); @@ -3710,6 +3796,10 @@ static int __init maple_tree_seed(void) goto skip; #endif + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); + check_store_null(&tree); + mtree_destroy(&tree); + mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE); check_root_expand(&tree); mtree_destroy(&tree); From e3d37a6f62953962102607fe4491129271510990 Mon Sep 17 00:00:00 2001 From: Marc Dionne Date: Thu, 31 Oct 2024 07:55:34 -0300 Subject: [PATCH 190/215] tools/mm: fix slabinfo crash when MAX_SLABS is exceeded The number of slabs can easily exceed the hard coded MAX_SLABS in the slabinfo tool, causing it to overwrite memory and crash. 
Increase the value of MAX_SLABS, and check if that has been exceeded for each new slab, instead of at the end when it's already too late. Also move the check for MAX_ALIASES into the loop body. Link: https://lkml.kernel.org/r/20241031105534.565533-1-marc.c.dionne@gmail.com Signed-off-by: Marc Dionne Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/mm/slabinfo.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/mm/slabinfo.c b/tools/mm/slabinfo.c index 04e9e6ba86ead..1433eff99feb0 100644 --- a/tools/mm/slabinfo.c +++ b/tools/mm/slabinfo.c @@ -21,7 +21,7 @@ #include #include -#define MAX_SLABS 500 +#define MAX_SLABS 2000 #define MAX_ALIASES 500 #define MAX_NODES 1024 @@ -1228,6 +1228,8 @@ static void read_slab_dir(void) continue; switch (de->d_type) { case DT_LNK: + if (alias - aliasinfo == MAX_ALIASES) + fatal("Too many aliases\n"); alias->name = strdup(de->d_name); count = readlink(de->d_name, buffer, sizeof(buffer)-1); @@ -1242,6 +1244,8 @@ static void read_slab_dir(void) alias++; break; case DT_DIR: + if (slab - slabinfo == MAX_SLABS) + fatal("Too many slabs\n"); if (chdir(de->d_name)) fatal("Unable to access slab %s\n", slab->name); slab->name = strdup(de->d_name); @@ -1312,10 +1316,6 @@ static void read_slab_dir(void) slabs = slab - slabinfo; actual_slabs = slabs; aliases = alias - aliasinfo; - if (slabs > MAX_SLABS) - fatal("Too many slabs\n"); - if (aliases > MAX_ALIASES) - fatal("Too many aliases\n"); } static void output_slabs(void) From 949042811117d2f437ef6b529a69d45e2ee2d429 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Fri, 1 Nov 2024 13:54:06 -0300 Subject: [PATCH 191/215] mm: shmem: control THP support through the kernel command line MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "mm: add more kernel parameters to control mTHP", v5. This series introduces four patches related to the kernel parameters controlling mTHP and a fifth patch replacing `strcpy()` for `strscpy()` in the file `mm/huge_memory.c`. The first patch is a straightforward documentation update, correcting the format of the kernel parameter ``thp_anon=``. The second, third, and fourth patches focus on controlling THP support for shmem via the kernel command line. The second patch introduces a parameter to control the global default huge page allocation policy for the internal shmem mount. The third patch moves a piece of code to a shared header to ease the implementation of the fourth patch. Finally, the fourth patch implements a parameter similar to ``thp_anon=``, but for shmem. The goal of these changes is to simplify the configuration of systems that rely on mTHP support for shmem. For instance, a platform with a GPU that benefits from huge pages may want to enable huge pages for shmem. Having these kernel parameters streamlines the configuration process and ensures consistency across setups. This patch (of 4): Add a new kernel command line to control the hugepage allocation policy for the internal shmem mount, ``transparent_hugepage_shmem``. The parameter is similar to ``transparent_hugepage`` and has the following format: transparent_hugepage_shmem= where ```` is one of the seven valid policies available for shmem. Configuring the default huge page allocation policy for the internal shmem mount can be beneficial for DRM GPU drivers. Just as CPU architectures, GPUs can also take advantage of huge pages, but this is possible only if DRM GEM objects are backed by huge pages. 
Since GEM uses shmem to allocate anonymous pageable memory, having control over the default huge page allocation policy allows for the exploration of huge pages use on GPUs that rely on GEM objects backed by shmem. Link: https://lkml.kernel.org/r/20241101165719.1074234-2-mcanal@igalia.com Link: https://lkml.kernel.org/r/20241101165719.1074234-4-mcanal@igalia.com Signed-off-by: Maíra Canal Reviewed-by: Baolin Wang Acked-by: David Hildenbrand Cc: Barry Song Cc: dri-devel@lists.freedesktop.org Cc: Hugh Dickins Cc: Jonathan Corbet Cc: kernel-dev@igalia.com Cc: Lance Yang Cc: Ryan Roberts Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 7 ++ Documentation/admin-guide/mm/transhuge.rst | 6 ++ mm/shmem.c | 72 +++++++++++++------ 3 files changed, 62 insertions(+), 23 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 1666576acc0eb..acabb04d0dd48 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6926,6 +6926,13 @@ See Documentation/admin-guide/mm/transhuge.rst for more details. + transparent_hugepage_shmem= [KNL] + Format: [always|within_size|advise|never|deny|force] + Can be used to control the hugepage allocation policy for + the internal shmem mount. + See Documentation/admin-guide/mm/transhuge.rst + for more details. + trusted.source= [KEYS] Format: This parameter identifies the trust source as a backend diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index abdf10a1c7db5..9c6f6da612c4f 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -326,6 +326,12 @@ PMD_ORDER THP policy will be overridden. If the policy for PMD_ORDER is not defined within a valid ``thp_anon``, its policy will default to ``never``. +Similarly to ``transparent_hugepage``, you can control the hugepage +allocation policy for the internal shmem mount by using the kernel parameter +``transparent_hugepage_shmem=``, where ```` is one of the +seven valid policies for shmem (``always``, ``within_size``, ``advise``, +``never``, ``deny``, and ``force``). 
+ Hugepages in tmpfs/shmem ======================== diff --git a/mm/shmem.c b/mm/shmem.c index 5afc5b1f7ae14..c83675c271909 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -582,24 +582,39 @@ static bool shmem_huge_global_enabled(struct inode *inode, pgoff_t index, } } -#if defined(CONFIG_SYSFS) static int shmem_parse_huge(const char *str) { + int huge; + + if (!str) + return -EINVAL; + if (!strcmp(str, "never")) - return SHMEM_HUGE_NEVER; - if (!strcmp(str, "always")) - return SHMEM_HUGE_ALWAYS; - if (!strcmp(str, "within_size")) - return SHMEM_HUGE_WITHIN_SIZE; - if (!strcmp(str, "advise")) - return SHMEM_HUGE_ADVISE; - if (!strcmp(str, "deny")) - return SHMEM_HUGE_DENY; - if (!strcmp(str, "force")) - return SHMEM_HUGE_FORCE; - return -EINVAL; + huge = SHMEM_HUGE_NEVER; + else if (!strcmp(str, "always")) + huge = SHMEM_HUGE_ALWAYS; + else if (!strcmp(str, "within_size")) + huge = SHMEM_HUGE_WITHIN_SIZE; + else if (!strcmp(str, "advise")) + huge = SHMEM_HUGE_ADVISE; + else if (!strcmp(str, "deny")) + huge = SHMEM_HUGE_DENY; + else if (!strcmp(str, "force")) + huge = SHMEM_HUGE_FORCE; + else + return -EINVAL; + + if (!has_transparent_hugepage() && + huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) + return -EINVAL; + + /* Do not override huge allocation policy with non-PMD sized mTHP */ + if (huge == SHMEM_HUGE_FORCE && + huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) + return -EINVAL; + + return huge; } -#endif #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static const char *shmem_format_huge(int huge) @@ -5065,15 +5080,7 @@ static ssize_t shmem_enabled_store(struct kobject *kobj, huge = shmem_parse_huge(tmp); if (huge == -EINVAL) - return -EINVAL; - if (!has_transparent_hugepage() && - huge != SHMEM_HUGE_NEVER && huge != SHMEM_HUGE_DENY) - return -EINVAL; - - /* Do not override huge allocation policy with non-PMD sized mTHP */ - if (huge == SHMEM_HUGE_FORCE && - huge_shmem_orders_inherit != BIT(HPAGE_PMD_ORDER)) - return -EINVAL; + return huge; shmem_huge = huge; if (shmem_huge > SHMEM_HUGE_DENY) @@ -5170,6 +5177,25 @@ struct kobj_attribute thpsize_shmem_enabled_attr = __ATTR(shmem_enabled, 0644, thpsize_shmem_enabled_show, thpsize_shmem_enabled_store); #endif /* CONFIG_TRANSPARENT_HUGEPAGE && CONFIG_SYSFS */ +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) + +static int __init setup_transparent_hugepage_shmem(char *str) +{ + int huge; + + huge = shmem_parse_huge(str); + if (huge == -EINVAL) { + pr_warn("transparent_hugepage_shmem= cannot parse, ignored\n"); + return huge; + } + + shmem_huge = huge; + return 1; +} +__setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); + +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + #else /* !CONFIG_SHMEM */ /* From 1c8d48497525d77acfb7bdaaa246a887e754f379 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Fri, 1 Nov 2024 13:54:07 -0300 Subject: [PATCH 192/215] mm: move ``get_order_from_str()`` to internal.h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In order to implement a kernel parameter similar to ``thp_anon=`` for shmem, we'll need the function ``get_order_from_str()``. Instead of duplicating the function, move the function to a shared header, in which both mm/shmem.c and mm/huge_memory.c will be able to use it. 
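As a rough, userspace-only approximation of what this helper computes (assuming a 4KiB base page size; this is not the kernel implementation, and the function name below is invented), parsing a size string with a memory suffix into an order looks like:

#include <stdio.h>
#include <stdlib.h>

/* Illustrative stand-in for get_order_from_str(), assuming 4KiB pages. */
static int order_from_str(const char *s)
{
	char *end;
	unsigned long size = strtoul(s, &end, 0);

	switch (*end) {
	case 'K': case 'k': size <<= 10; break;
	case 'M': case 'm': size <<= 20; break;
	case 'G': case 'g': size <<= 30; break;
	}

	if (size < 4096 || (size & (size - 1)))
		return -1;			/* must be a power of two */

	return __builtin_ctzl(size >> 12);	/* log2(size / PAGE_SIZE) */
}

int main(void)
{
	/* With 4KiB pages: "64K" -> order 4, "2M" -> order 9. */
	printf("64K -> %d, 2M -> %d\n",
	       order_from_str("64K"), order_from_str("2M"));
	return 0;
}

The kernel helper additionally rejects orders outside a caller-supplied mask of valid orders (for example THP_ORDERS_ALL_ANON for ``thp_anon=``), which the sketch above omits.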
Link: https://lkml.kernel.org/r/20241101165719.1074234-5-mcanal@igalia.com Signed-off-by: Maíra Canal Reviewed-by: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Lance Yang Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 38 +++++++++++++++----------------------- mm/internal.h | 22 ++++++++++++++++++++++ 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f920688644692..a6edbd8c4f495 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -958,26 +958,6 @@ static int __init setup_transparent_hugepage(char *str) } __setup("transparent_hugepage=", setup_transparent_hugepage); -static inline int get_order_from_str(const char *size_str) -{ - unsigned long size; - char *endptr; - int order; - - size = memparse(size_str, &endptr); - - if (!is_power_of_2(size)) - goto err; - order = get_order(size); - if (BIT(order) & ~THP_ORDERS_ALL_ANON) - goto err; - - return order; -err: - pr_err("invalid size %s in thp_anon boot parameter\n", size_str); - return -EINVAL; -} - static char str_dup[PAGE_SIZE] __initdata; static int __init setup_thp_anon(char *str) { @@ -1007,10 +987,22 @@ static int __init setup_thp_anon(char *str) start_size = strsep(&subtoken, "-"); end_size = subtoken; - start = get_order_from_str(start_size); - end = get_order_from_str(end_size); + start = get_order_from_str(start_size, THP_ORDERS_ALL_ANON); + end = get_order_from_str(end_size, THP_ORDERS_ALL_ANON); } else { - start = end = get_order_from_str(subtoken); + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_ANON); + } + + if (start == -EINVAL) { + pr_err("invalid size %s in thp_anon boot parameter\n", start_size); + goto err; + } + + if (end == -EINVAL) { + pr_err("invalid size %s in thp_anon boot parameter\n", end_size); + goto err; } if (start < 0 || end < 0 || start > end) diff --git a/mm/internal.h b/mm/internal.h index d5b93c5b63648..5a7302baeed7c 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -1291,6 +1291,28 @@ static inline bool alloc_zeroed(void) &init_on_alloc); } +/* + * Parses a string with mem suffixes into its order. Useful to parse kernel + * parameters. + */ +static inline int get_order_from_str(const char *size_str, + unsigned long valid_orders) +{ + unsigned long size; + char *endptr; + int order; + + size = memparse(size_str, &endptr); + + if (!is_power_of_2(size)) + return -EINVAL; + order = get_order(size); + if (BIT(order) & ~valid_orders) + return -EINVAL; + + return order; +} + enum { /* mark page accessed */ FOLL_TOUCH = 1 << 16, From 24f9cd195fbc9382ae0ed8b332e6302d1722d8e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Fri, 1 Nov 2024 13:54:08 -0300 Subject: [PATCH 193/215] mm: shmem: override mTHP shmem default with a kernel parameter MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add the ``thp_shmem=`` kernel command line to allow specifying the default policy of each supported shmem hugepage size. The kernel parameter accepts the following format: thp_shmem=[KMG],[KMG]:;[KMG]-[KMG]: For example, thp_shmem=16K-64K:always;128K,512K:inherit;256K:advise;1M-2M:never;4M-8M:within_size Some GPUs may benefit from using huge pages. Since DRM GEM uses shmem to allocate anonymous pageable memory, it's essential to control the huge page allocation policy for the internal shmem mount. This control can be achieved through the ``transparent_hugepage_shmem=`` parameter. 
Beyond just setting the allocation policy, it's crucial to have granular control over the size of huge pages that can be allocated. The GPU may support only specific huge page sizes, and allocating pages larger/smaller than those sizes would be ineffective. Link: https://lkml.kernel.org/r/20241101165719.1074234-6-mcanal@igalia.com Signed-off-by: Maíra Canal Reviewed-by: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Lance Yang Cc: Ryan Roberts Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 10 ++ Documentation/admin-guide/mm/transhuge.rst | 17 +++ mm/shmem.c | 105 +++++++++++++++++- 3 files changed, 131 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index acabb04d0dd48..b48d744d99b05 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6700,6 +6700,16 @@ Force threading of all interrupt handlers except those marked explicitly IRQF_NO_THREAD. + thp_shmem= [KNL] + Format: [KMG],[KMG]:;[KMG]-[KMG]: + Control the default policy of each hugepage size for the + internal shmem mount. is one of policies available + for the shmem mount ("always", "inherit", "never", "within_size", + and "advise"). + It can be used multiple times for multiple shmem THP sizes. + See Documentation/admin-guide/mm/transhuge.rst for more + details. + topology= [S390,EARLY] Format: {off | on} Specify if the kernel should make use of the cpu diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 9c6f6da612c4f..5034915f4e8e8 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -332,6 +332,23 @@ allocation policy for the internal shmem mount by using the kernel parameter seven valid policies for shmem (``always``, ``within_size``, ``advise``, ``never``, ``deny``, and ``force``). +In the same manner as ``thp_anon`` controls each supported anonymous THP +size, ``thp_shmem`` controls each supported shmem THP size. ``thp_shmem`` +has the same format as ``thp_anon``, but also supports the policy +``within_size``. + +``thp_shmem=`` may be specified multiple times to configure all THP sizes +as required. If ``thp_shmem=`` is specified at least once, any shmem THP +sizes not explicitly configured on the command line are implicitly set to +``never``. + +``transparent_hugepage_shmem`` setting only affects the global toggle. If +``thp_shmem`` is not specified, PMD_ORDER hugepage will default to +``inherit``. However, if a valid ``thp_shmem`` setting is provided by the +user, the PMD_ORDER hugepage policy will be overridden. If the policy for +PMD_ORDER is not defined within a valid ``thp_shmem``, its policy will +default to ``never``. + Hugepages in tmpfs/shmem ======================== diff --git a/mm/shmem.c b/mm/shmem.c index c83675c271909..579e58cb3262e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -136,6 +136,7 @@ static unsigned long huge_shmem_orders_always __read_mostly; static unsigned long huge_shmem_orders_madvise __read_mostly; static unsigned long huge_shmem_orders_inherit __read_mostly; static unsigned long huge_shmem_orders_within_size __read_mostly; +static bool shmem_orders_configured __initdata; #endif #ifdef CONFIG_TMPFS @@ -5026,7 +5027,8 @@ void __init shmem_init(void) * Default to setting PMD-sized THP to inherit the global setting and * disable all other multi-size THPs. 
*/ - huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); + if (!shmem_orders_configured) + huge_shmem_orders_inherit = BIT(HPAGE_PMD_ORDER); #endif return; @@ -5194,6 +5196,107 @@ static int __init setup_transparent_hugepage_shmem(char *str) } __setup("transparent_hugepage_shmem=", setup_transparent_hugepage_shmem); +static char str_dup[PAGE_SIZE] __initdata; +static int __init setup_thp_shmem(char *str) +{ + char *token, *range, *policy, *subtoken; + unsigned long always, inherit, madvise, within_size; + char *start_size, *end_size; + int start, end, nr; + char *p; + + if (!str || strlen(str) + 1 > PAGE_SIZE) + goto err; + strscpy(str_dup, str); + + always = huge_shmem_orders_always; + inherit = huge_shmem_orders_inherit; + madvise = huge_shmem_orders_madvise; + within_size = huge_shmem_orders_within_size; + p = str_dup; + while ((token = strsep(&p, ";")) != NULL) { + range = strsep(&token, ":"); + policy = token; + + if (!policy) + goto err; + + while ((subtoken = strsep(&range, ",")) != NULL) { + if (strchr(subtoken, '-')) { + start_size = strsep(&subtoken, "-"); + end_size = subtoken; + + start = get_order_from_str(start_size, + THP_ORDERS_ALL_FILE_DEFAULT); + end = get_order_from_str(end_size, + THP_ORDERS_ALL_FILE_DEFAULT); + } else { + start_size = end_size = subtoken; + start = end = get_order_from_str(subtoken, + THP_ORDERS_ALL_FILE_DEFAULT); + } + + if (start == -EINVAL) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + start_size); + goto err; + } + + if (end == -EINVAL) { + pr_err("invalid size %s in thp_shmem boot parameter\n", + end_size); + goto err; + } + + if (start < 0 || end < 0 || start > end) + goto err; + + nr = end - start + 1; + if (!strcmp(policy, "always")) { + bitmap_set(&always, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "advise")) { + bitmap_set(&madvise, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "inherit")) { + bitmap_set(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else if (!strcmp(policy, "within_size")) { + bitmap_set(&within_size, start, nr); + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + } else if (!strcmp(policy, "never")) { + bitmap_clear(&inherit, start, nr); + bitmap_clear(&madvise, start, nr); + bitmap_clear(&always, start, nr); + bitmap_clear(&within_size, start, nr); + } else { + pr_err("invalid policy %s in thp_shmem boot parameter\n", policy); + goto err; + } + } + } + + huge_shmem_orders_always = always; + huge_shmem_orders_madvise = madvise; + huge_shmem_orders_inherit = inherit; + huge_shmem_orders_within_size = within_size; + shmem_orders_configured = true; + return 1; + +err: + pr_warn("thp_shmem=%s: error parsing string, ignoring setting\n", str); + return 0; +} +__setup("thp_shmem=", setup_thp_shmem); + #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #else /* !CONFIG_SHMEM */ From 93c1e57adeb0aa7d3feedeb82ac19845cbe540de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Fri, 1 Nov 2024 13:54:09 -0300 Subject: [PATCH 194/215] mm: huge_memory: use strscpy() instead of strcpy() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace strcpy() with strscpy() in mm/huge_memory.c strcpy() has been 
deprecated because it is generally unsafe, so help to eliminate it from the kernel source. Link: https://github.com/KSPP/linux/issues/88 Link: https://lkml.kernel.org/r/20241101165719.1074234-7-mcanal@igalia.com Signed-off-by: Maíra Canal Reviewed-by: Lance Yang Cc: Baolin Wang Cc: Barry Song Cc: David Hildenbrand Cc: Hugh Dickins Cc: Jonathan Corbet Cc: Ryan Roberts Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a6edbd8c4f495..1ebe18ec45607 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -969,7 +969,7 @@ static int __init setup_thp_anon(char *str) if (!str || strlen(str) + 1 > PAGE_SIZE) goto err; - strcpy(str_dup, str); + strscpy(str_dup, str); always = huge_anon_orders_always; madvise = huge_anon_orders_madvise; @@ -4167,7 +4167,7 @@ static ssize_t split_huge_pages_write(struct file *file, const char __user *buf, tok = strsep(&buf, ","); if (tok) { - strcpy(file_path, tok); + strscpy(file_path, tok); } else { ret = -EINVAL; goto out; From ad2bc8812fc17c8536d5e37aa0754463b76b66a4 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Fri, 1 Nov 2024 18:46:27 +0000 Subject: [PATCH 195/215] mm: remove unnecessary page_table_lock on stack expansion Ever since commit 8d7071af8907 ("mm: always expand the stack with the mmap write lock held") we have been expanding the stack with the mmap write lock held. This is true in all code paths: get_arg_page() -> expand_downwards() setup_arg_pages() -> expand_stack_locked() -> expand_downwards() / expand_upwards() lock_mm_and_find_vma() -> expand_stack_locked() -> expand_downwards() / expand_upwards() create_elf_tables() -> find_extend_vma_locked() -> expand_stack_locked() expand_stack() -> vma_expand_down() -> expand_downwards() expand_stack() -> vma_expand_up() -> expand_upwards() Each of which acquire the mmap write lock before doing so. Despite this, we maintain code that acquires a page table lock in the expand_upwards() and expand_downwards() code, stating that we hold a shared mmap lock and thus this is necessary. It is not, we do not have to worry about concurrent VMA expansions so we can simply drop this, and update comments accordingly. We do not even need be concerned with racing page faults, as vma_start_write() is invoked in both cases. Link: https://lkml.kernel.org/r/20241101184627.131391-1-lorenzo.stoakes@oracle.com Signed-off-by: Lorenzo Stoakes Acked-by: Linus Torvalds Reviewed-by: Jann Horn Acked-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Signed-off-by: Andrew Morton --- mm/mmap.c | 38 ++++++-------------------------------- 1 file changed, 6 insertions(+), 32 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index f904b3bba9627..386429f7db5a0 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1039,6 +1039,8 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; + mmap_assert_write_locked(mm); + /* Guard against exceeding limits of the address space. */ address &= PAGE_MASK; if (address >= (TASK_SIZE & PAGE_MASK)) @@ -1074,11 +1076,7 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); - /* - * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_lock in read mode. We need the - * anon_vma lock to serialize against concurrent expand_stacks. - */ + /* We update the anon VMA tree. 
*/ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ @@ -1092,16 +1090,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (vma->vm_pgoff + (size >> PAGE_SHIFT) >= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { - /* - * We only hold a shared mmap_lock lock here, so - * we need to protect against concurrent vma - * expansions. anon_vma_lock_write() doesn't - * help here, as we don't guarantee that all - * growable vmas in a mm share the same root - * anon vma. So, we reuse mm->page_table_lock - * to guard against concurrent vma expansions. - */ - spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); @@ -1110,7 +1098,6 @@ static int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Overwrite old entry in mtree. */ vma_iter_store(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); - spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } @@ -1137,6 +1124,8 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) if (!(vma->vm_flags & VM_GROWSDOWN)) return -EFAULT; + mmap_assert_write_locked(mm); + address &= PAGE_MASK; if (address < mmap_min_addr || address < FIRST_USER_ADDRESS) return -EPERM; @@ -1166,11 +1155,7 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) /* Lock the VMA before expanding to prevent concurrent page faults */ vma_start_write(vma); - /* - * vma->vm_start/vm_end cannot change under us because the caller - * is required to hold the mmap_lock in read mode. We need the - * anon_vma lock to serialize against concurrent expand_stacks. - */ + /* We update the anon VMA tree. */ anon_vma_lock_write(vma->anon_vma); /* Somebody else might have raced and expanded it already */ @@ -1184,16 +1169,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) if (grow <= vma->vm_pgoff) { error = acct_stack_growth(vma, size, grow); if (!error) { - /* - * We only hold a shared mmap_lock lock here, so - * we need to protect against concurrent vma - * expansions. anon_vma_lock_write() doesn't - * help here, as we don't guarantee that all - * growable vmas in a mm share the same root - * anon vma. So, we reuse mm->page_table_lock - * to guard against concurrent vma expansions. - */ - spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, grow); @@ -1203,7 +1178,6 @@ int expand_downwards(struct vm_area_struct *vma, unsigned long address) /* Overwrite old entry in mtree. */ vma_iter_store(&vmi, vma); anon_vma_interval_tree_post_update_vma(vma); - spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); } From c28432acf61751c2be8b36cb831dd490d2aed465 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Fri, 1 Nov 2024 23:40:10 +0500 Subject: [PATCH 196/215] kasan: use EXPORT_SYMBOL_IF_KUNIT to export symbols Patch series "kasan: few improvements on kunit tests". This patch series addresses the issue [1] with KASAN symbols used in the Kunit test, but exported as EXPORT_SYMBOL_GPL. Also a small tweak of marking kasan_atomics() as KUNIT_CASE_SLOW to avoid kunit report that the test should be marked as slow. This patch (of 2): Replace EXPORT_SYMBOL_GPL with EXPORT_SYMBOL_IF_KUNIT to mark the symbols as visible only if CONFIG_KUNIT is enabled. KASAN Kunit test should import the namespace EXPORTED_FOR_KUNIT_TESTING to use these marked symbols. 
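For context, the export pattern adopted here looks roughly like the sketch below; the foo_* names are invented for illustration, while the macros themselves come from <kunit/visibility.h> exactly as used in the diff:

/* implementation side: visible and exported only when CONFIG_KUNIT is set */
#include <kunit/visibility.h>

VISIBLE_IF_KUNIT void foo_force_debug_state(void)
{
	/* test-only hook; VISIBLE_IF_KUNIT expands to "static" without KUnit */
}
EXPORT_SYMBOL_IF_KUNIT(foo_force_debug_state);

/* test side: the KUnit test module imports the namespace before using it */
#include <kunit/test.h>
#include <kunit/visibility.h>

MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING);

The corresponding declaration in a header would typically be guarded by IS_ENABLED(CONFIG_KUNIT) so non-test builds see no extra symbols.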
Link: https://lkml.kernel.org/r/20241101184011.3369247-1-snovitoll@gmail.com Link: https://lkml.kernel.org/r/20241101184011.3369247-2-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Reported-by: Andrey Konovalov Closes: https://bugzilla.kernel.org/show_bug.cgi?id=218315 Reviewed-by: Andrey Konovalov Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/hw_tags.c | 7 ++++--- mm/kasan/kasan_test_c.c | 2 ++ mm/kasan/report.c | 17 +++++++++-------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9958ebc15d383..ccd66c7a4081d 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -8,6 +8,7 @@ #define pr_fmt(fmt) "kasan: " fmt +#include #include #include #include @@ -394,12 +395,12 @@ void kasan_enable_hw_tags(void) #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -EXPORT_SYMBOL_GPL(kasan_enable_hw_tags); +EXPORT_SYMBOL_IF_KUNIT(kasan_enable_hw_tags); -void kasan_force_async_fault(void) +VISIBLE_IF_KUNIT void kasan_force_async_fault(void) { hw_force_async_tag_fault(); } -EXPORT_SYMBOL_GPL(kasan_force_async_fault); +EXPORT_SYMBOL_IF_KUNIT(kasan_force_async_fault); #endif diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index fd5058c5d0f75..3daa69a52cbff 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -33,6 +33,8 @@ #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) +MODULE_IMPORT_NS(EXPORTED_FOR_KUNIT_TESTING); + static bool multishot; /* Fields set based on lines observed in the console. */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 3e48668c3e40a..50fb19ad43881 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -10,6 +10,7 @@ */ #include +#include #include #include #include @@ -134,18 +135,18 @@ static bool report_enabled(void) #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) -bool kasan_save_enable_multi_shot(void) +VISIBLE_IF_KUNIT bool kasan_save_enable_multi_shot(void) { return test_and_set_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); } -EXPORT_SYMBOL_GPL(kasan_save_enable_multi_shot); +EXPORT_SYMBOL_IF_KUNIT(kasan_save_enable_multi_shot); -void kasan_restore_multi_shot(bool enabled) +VISIBLE_IF_KUNIT void kasan_restore_multi_shot(bool enabled) { if (!enabled) clear_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags); } -EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); +EXPORT_SYMBOL_IF_KUNIT(kasan_restore_multi_shot); #endif @@ -157,17 +158,17 @@ EXPORT_SYMBOL_GPL(kasan_restore_multi_shot); */ static bool kasan_kunit_executing; -void kasan_kunit_test_suite_start(void) +VISIBLE_IF_KUNIT void kasan_kunit_test_suite_start(void) { WRITE_ONCE(kasan_kunit_executing, true); } -EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_start); +EXPORT_SYMBOL_IF_KUNIT(kasan_kunit_test_suite_start); -void kasan_kunit_test_suite_end(void) +VISIBLE_IF_KUNIT void kasan_kunit_test_suite_end(void) { WRITE_ONCE(kasan_kunit_executing, false); } -EXPORT_SYMBOL_GPL(kasan_kunit_test_suite_end); +EXPORT_SYMBOL_IF_KUNIT(kasan_kunit_test_suite_end); static bool kasan_kunit_test_suite_executing(void) { From 1857099c18e16a72bb7d0a84afb323663d49ee00 Mon Sep 17 00:00:00 2001 From: Sabyrzhan Tasbolatov Date: Fri, 1 Nov 2024 23:40:11 +0500 Subject: [PATCH 197/215] kasan: change kasan_atomics kunit test as KUNIT_CASE_SLOW During running KASAN Kunit tests with CONFIG_KASAN enabled, the following "warning" is reported by kunit framework: # kasan_atomics: Test should be marked slow (runtime: 2.604703115s) It took 2.6 seconds on my PC 
(Intel(R) Core(TM) i7-7700K CPU @ 4.20GHz), apparently, due to multiple atomic checks in kasan_atomics_helper(). Let's mark it with KUNIT_CASE_SLOW which reports now as: # kasan_atomics.speed: slow Link: https://lkml.kernel.org/r/20241101184011.3369247-3-snovitoll@gmail.com Signed-off-by: Sabyrzhan Tasbolatov Reviewed-by: Andrey Konovalov Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Cc: Vincenzo Frascino Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_c.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 3daa69a52cbff..000b996ff695b 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -2075,7 +2075,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kasan_strings), KUNIT_CASE(kasan_bitops_generic), KUNIT_CASE(kasan_bitops_tags), - KUNIT_CASE(kasan_atomics), + KUNIT_CASE_SLOW(kasan_atomics), KUNIT_CASE(vmalloc_helpers_tags), KUNIT_CASE(vmalloc_oob), KUNIT_CASE(vmap_tags), From 3738290bfc99606787f515a4590ad38dc4f79ca4 Mon Sep 17 00:00:00 2001 From: Nihar Chaithanya Date: Tue, 15 Oct 2024 00:31:30 +0530 Subject: [PATCH 198/215] kasan: add kunit tests for kmalloc_track_caller, kmalloc_node_track_caller The Kunit tests for kmalloc_track_caller and kmalloc_node_track_caller were missing in kasan_test_c.c, which check that these functions poison the memory properly. Add a Kunit test: -> kmalloc_tracker_caller_oob_right(): This includes out-of-bounds access test for kmalloc_track_caller and kmalloc_node_track_caller. Link: https://lkml.kernel.org/r/20241014190128.442059-1-niharchaithanya@gmail.com Link: https://bugzilla.kernel.org/show_bug.cgi?id=216509 Signed-off-by: Nihar Chaithanya Reviewed-by: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/kasan/kasan_test_c.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/mm/kasan/kasan_test_c.c b/mm/kasan/kasan_test_c.c index 000b996ff695b..e0ec5a6d15be1 100644 --- a/mm/kasan/kasan_test_c.c +++ b/mm/kasan/kasan_test_c.c @@ -215,6 +215,36 @@ static void kmalloc_node_oob_right(struct kunit *test) kfree(ptr); } +static void kmalloc_track_caller_oob_right(struct kunit *test) +{ + char *ptr; + size_t size = 128 - KASAN_GRANULE_SIZE; + + /* + * Check that KASAN detects out-of-bounds access for object allocated via + * kmalloc_track_caller(). + */ + ptr = kmalloc_track_caller(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'y'); + + kfree(ptr); + + /* + * Check that KASAN detects out-of-bounds access for object allocated via + * kmalloc_node_track_caller(). + */ + ptr = kmalloc_node_track_caller(size, GFP_KERNEL, 0); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr); + + OPTIMIZER_HIDE_VAR(ptr); + KUNIT_EXPECT_KASAN_FAIL(test, ptr[size] = 'y'); + + kfree(ptr); +} + /* * Check that KASAN detects an out-of-bounds access for a big object allocated * via kmalloc(). But not as big as to trigger the page_alloc fallback. 
@@ -2015,6 +2045,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_oob_right), KUNIT_CASE(kmalloc_oob_left), KUNIT_CASE(kmalloc_node_oob_right), + KUNIT_CASE(kmalloc_track_caller_oob_right), KUNIT_CASE(kmalloc_big_oob_right), KUNIT_CASE(kmalloc_large_oob_right), KUNIT_CASE(kmalloc_large_uaf), From 3f28bbe56c7b77e73f1dd0515cad009cfdd64962 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 5 Nov 2024 01:52:52 +0800 Subject: [PATCH 199/215] mm/list_lru: don't pass unnecessary key parameters Patch series "mm/list_lru: Split list_lru lock into per-cgroup scope". When LOCKDEP is not enabled, lock_class_key is an empty struct that is never used. But the list_lru initialization function still takes a placeholder pointer as parameter, and the compiler cannot optimize it because the function is not static and exported. Remove this parameter and move it inside the list_lru struct. Only use it when LOCKDEP is enabled. Kernel builds with LOCKDEP will be slightly larger, while !LOCKDEP builds without it will be slightly smaller (the common case). Link: https://lkml.kernel.org/r/20241104175257.60853-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20241104175257.60853-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Shakeel Butt Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Waiman Long Signed-off-by: Andrew Morton --- include/linux/list_lru.h | 18 +++++++++++++++--- mm/list_lru.c | 9 +++++---- mm/workingset.c | 4 ++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 5099a8ccd5f4c..eba93f6511f31 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -56,16 +56,28 @@ struct list_lru { bool memcg_aware; struct xarray xa; #endif +#ifdef CONFIG_LOCKDEP + struct lock_class_key *key; +#endif }; void list_lru_destroy(struct list_lru *lru); int __list_lru_init(struct list_lru *lru, bool memcg_aware, - struct lock_class_key *key, struct shrinker *shrinker); + struct shrinker *shrinker); #define list_lru_init(lru) \ - __list_lru_init((lru), false, NULL, NULL) + __list_lru_init((lru), false, NULL) #define list_lru_init_memcg(lru, shrinker) \ - __list_lru_init((lru), true, NULL, shrinker) + __list_lru_init((lru), true, shrinker) + +static inline int list_lru_init_memcg_key(struct list_lru *lru, struct shrinker *shrinker, + struct lock_class_key *key) +{ +#ifdef CONFIG_LOCKDEP + lru->key = key; +#endif + return list_lru_init_memcg(lru, shrinker); +} int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp); diff --git a/mm/list_lru.c b/mm/list_lru.c index 9b7ff06e9d326..ea7dc9fa4d05a 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -562,8 +562,7 @@ static void memcg_destroy_list_lru(struct list_lru *lru) } #endif /* CONFIG_MEMCG */ -int __list_lru_init(struct list_lru *lru, bool memcg_aware, - struct lock_class_key *key, struct shrinker *shrinker) +int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shrinker) { int i; @@ -583,8 +582,10 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, for_each_node(i) { spin_lock_init(&lru->node[i].lock); - if (key) - lockdep_set_class(&lru->node[i].lock, key); +#ifdef CONFIG_LOCKDEP + if (lru->key) + lockdep_set_class(&lru->node[i].lock, lru->key); +#endif init_one_lru(&lru->node[i].lru); } diff --git a/mm/workingset.c b/mm/workingset.c index 0e38bec261a41..5c8861edbf171 100644 --- a/mm/workingset.c +++ 
b/mm/workingset.c @@ -813,8 +813,8 @@ static int __init workingset_init(void) if (!workingset_shadow_shrinker) goto err; - ret = __list_lru_init(&shadow_nodes, true, &shadow_nodes_key, - workingset_shadow_shrinker); + ret = list_lru_init_memcg_key(&shadow_nodes, workingset_shadow_shrinker, + &shadow_nodes_key); if (ret) goto err_list_lru; From 78c0ed09131b772f062b986a2fcca6600daa6285 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 5 Nov 2024 01:52:53 +0800 Subject: [PATCH 200/215] mm/list_lru: don't export list_lru_add It's no longer used by any module, just remove it. Link: https://lkml.kernel.org/r/20241104175257.60853-3-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Muchun Song Acked-by: Shakeel Butt Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Waiman Long Signed-off-by: Andrew Morton --- mm/list_lru.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index ea7dc9fa4d05a..a798e7624f690 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -106,7 +106,6 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, spin_unlock(&nlru->lock); return false; } -EXPORT_SYMBOL_GPL(list_lru_add); bool list_lru_add_obj(struct list_lru *lru, struct list_head *item) { From 8d42abbfa4efe5fced63c0157d55f30347d7802c Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 5 Nov 2024 01:52:54 +0800 Subject: [PATCH 201/215] mm/list_lru: code clean up for reparenting No feature change, just change of code structure and fix comment. The list lrus are not empty until memcg_reparent_list_lru_node() calls are all done, so the comments in memcg_offline_kmem were slightly inaccurate. Link: https://lkml.kernel.org/r/20241104175257.60853-4-ryncsn@gmail.com Signed-off-by: Kairui Song Reviewed-by: Muchun Song Acked-by: Shakeel Butt Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Qi Zheng Cc: Roman Gushchin Cc: Waiman Long Signed-off-by: Andrew Morton --- mm/list_lru.c | 39 +++++++++++++++++---------------------- mm/memcontrol.c | 7 ------- 2 files changed, 17 insertions(+), 29 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index a798e7624f690..b54f092d4d655 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -421,35 +421,16 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, spin_unlock_irq(&nlru->lock); } -static void memcg_reparent_list_lru(struct list_lru *lru, - int src_idx, struct mem_cgroup *dst_memcg) -{ - int i; - - for_each_node(i) - memcg_reparent_list_lru_node(lru, i, src_idx, dst_memcg); - - memcg_list_lru_free(lru, src_idx); -} - void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { struct cgroup_subsys_state *css; struct list_lru *lru; - int src_idx = memcg->kmemcg_id; + int src_idx = memcg->kmemcg_id, i; /* * Change kmemcg_id of this cgroup and all its descendants to the * parent's id, and then move all entries from this cgroup's list_lrus * to ones of the parent. - * - * After we have finished, all list_lrus corresponding to this cgroup - * are guaranteed to remain empty. So we can safely free this cgroup's - * list lrus in memcg_list_lru_free(). - * - * Changing ->kmemcg_id to the parent can prevent memcg_list_lru_alloc() - * from allocating list lrus for this cgroup after memcg_list_lru_free() - * call. 
*/ rcu_read_lock(); css_for_each_descendant_pre(css, &memcg->css) { @@ -460,9 +441,23 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren } rcu_read_unlock(); + /* + * With kmemcg_id set to parent, holding the lock of each list_lru_node + * below can prevent list_lru_{add,del,isolate} from touching the lru, + * safe to reparent. + */ mutex_lock(&list_lrus_mutex); - list_for_each_entry(lru, &memcg_list_lrus, list) - memcg_reparent_list_lru(lru, src_idx, parent); + list_for_each_entry(lru, &memcg_list_lrus, list) { + for_each_node(i) + memcg_reparent_list_lru_node(lru, i, src_idx, parent); + + /* + * Here all list_lrus corresponding to the cgroup are guaranteed + * to remain empty, we can safely free this lru, any further + * memcg_list_lru_alloc() call will simply bail out. + */ + memcg_list_lru_free(lru, src_idx); + } mutex_unlock(&list_lrus_mutex); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6486e96528434..8b9061b58a728 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3111,13 +3111,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) parent = root_mem_cgroup; memcg_reparent_objcgs(memcg, parent); - - /* - * After we have finished memcg_reparent_objcgs(), all list_lrus - * corresponding to this cgroup are guaranteed to remain empty. - * The ordering is imposed by list_lru_node->lock taken by - * memcg_reparent_list_lrus(). - */ memcg_reparent_list_lrus(memcg, parent); } From 28e98022b31efdb8f1ba310d938cd9b97ededfe4 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 5 Nov 2024 01:52:55 +0800 Subject: [PATCH 202/215] mm/list_lru: simplify reparenting and initial allocation Currently, there is a lot of code for detecting reparent racing using kmemcg_id as the synchronization flag. And an intermediate table is required to record and compare the kmemcg_id. We can simplify this by just checking the cgroup css status, skip if cgroup is being offlined. On the reparenting side, ensure no more allocation is on going and no further allocation will occur by using the XArray lock as barrier. Combined with a O(n^2) top-down walk for the allocation, we get rid of the intermediate table allocation completely. Despite being O(n^2), it should be actually faster because it's not practical to have a very deep cgroup level, and in most cases the parent cgroup should have been allocated already. This also avoided changing kmemcg_id before reparenting, making cgroups have a stable index for list_lru_memcg. After this change it's possible that a dying cgroup will see a NULL value in XArray corresponding to the kmemcg_id, because the kmemcg_id will point to an empty slot. In such case, just fallback to use its parent. 
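Condensing the diff that follows, the new scheme can be sketched as pseudo-kernel-C (simplified from the actual hunks rather than copied verbatim):

/* reparent side: erase the dying memcg's slot first, under the XArray lock */
mlru = xa_erase_irq(&lru->xa, memcg->kmemcg_id);
if (mlru) {
	for_each_node(nid)
		memcg_reparent_list_lru_node(lru, nid, &mlru->node[nid], parent);
	kvfree_rcu(mlru, rcu);
}

/* lookup side: a dying memcg whose slot is already NULL uses its parent */
l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
while (!l) {
	VM_WARN_ON(!css_is_dying(&memcg->css));
	memcg = parent_mem_cgroup(memcg);
	l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg));
}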
As a result the code is simpler, following test also showed a very slight performance gain (12 test runs): prepare() { mkdir /tmp/test-fs modprobe brd rd_nr=1 rd_size=16777216 mkfs.xfs -f /dev/ram0 mount -t xfs /dev/ram0 /tmp/test-fs for i in $(seq 10000); do seq 8000 > "/tmp/test-fs/$i" done mkdir -p /sys/fs/cgroup/system.slice/bench/test/1 echo +memory > /sys/fs/cgroup/system.slice/bench/cgroup.subtree_control echo +memory > /sys/fs/cgroup/system.slice/bench/test/cgroup.subtree_control echo +memory > /sys/fs/cgroup/system.slice/bench/test/1/cgroup.subtree_control echo 768M > /sys/fs/cgroup/system.slice/bench/memory.max } do_test() { read_worker() { mkdir -p "/sys/fs/cgroup/system.slice/bench/test/1/$1" echo $BASHPID > "/sys/fs/cgroup/system.slice/bench/test/1/$1/cgroup.procs" read -r __TMP < "/tmp/test-fs/$1"; } read_in_all() { for i in $(seq 10000); do read_worker "$i" & done; wait } echo 3 > /proc/sys/vm/drop_caches time read_in_all for i in $(seq 1 10000); do rmdir "/sys/fs/cgroup/system.slice/bench/test/1/$i" &>/dev/null done } Before: real 0m3.498s user 0m11.037s sys 0m35.872s real 1m33.860s user 0m11.593s sys 3m1.169s real 1m31.883s user 0m11.265s sys 2m59.198s real 1m32.394s user 0m11.294s sys 3m1.616s real 1m31.017s user 0m11.379s sys 3m1.349s real 1m31.931s user 0m11.295s sys 2m59.863s real 1m32.758s user 0m11.254s sys 2m59.538s real 1m35.198s user 0m11.145s sys 3m1.123s real 1m30.531s user 0m11.393s sys 2m58.089s real 1m31.142s user 0m11.333s sys 3m0.549s After: real 0m3.489s user 0m10.943s sys 0m36.036s real 1m10.893s user 0m11.495s sys 2m38.545s real 1m29.129s user 0m11.382s sys 3m1.601s real 1m29.944s user 0m11.494s sys 3m1.575s real 1m31.208s user 0m11.451s sys 2m59.693s real 1m25.944s user 0m11.327s sys 2m56.394s real 1m28.599s user 0m11.312s sys 3m0.162s real 1m26.746s user 0m11.538s sys 2m55.462s real 1m30.668s user 0m11.475s sys 3m2.075s real 1m29.258s user 0m11.292s sys 3m0.780s Which is slightly faster in real time. Link: https://lkml.kernel.org/r/20241104175257.60853-5-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Signed-off-by: Andrew Morton --- mm/list_lru.c | 178 +++++++++++++++++++++----------------------------- mm/zswap.c | 7 +- 2 files changed, 77 insertions(+), 108 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index b54f092d4d655..172b16146e157 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -59,6 +59,20 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } return &lru->node[nid].lru; } + +static inline struct list_lru_one * +list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg) +{ + struct list_lru_one *l; +again: + l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + if (likely(l)) + return l; + + memcg = parent_mem_cgroup(memcg); + VM_WARN_ON(!css_is_dying(&memcg->css)); + goto again; +} #else static void list_lru_register(struct list_lru *lru) { @@ -83,6 +97,12 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) { return &lru->node[nid].lru; } + +static inline struct list_lru_one * +list_lru_from_memcg(struct list_lru *lru, int nid, int idx) +{ + return &lru->node[nid].lru; +} #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. 
*/ @@ -94,7 +114,7 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, spin_lock(&nlru->lock); if (list_empty(item)) { - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + l = list_lru_from_memcg(lru, nid, memcg); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) @@ -133,7 +153,7 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, spin_lock(&nlru->lock); if (!list_empty(item)) { - l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); + l = list_lru_from_memcg(lru, nid, memcg); list_del_init(item); l->nr_items--; nlru->nr_items--; @@ -355,20 +375,6 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) return mlru; } -static void memcg_list_lru_free(struct list_lru *lru, int src_idx) -{ - struct list_lru_memcg *mlru = xa_erase_irq(&lru->xa, src_idx); - - /* - * The __list_lru_walk_one() can walk the list of this node. - * We need kvfree_rcu() here. And the walking of the list - * is under lru->node[nid]->lock, which can serve as a RCU - * read-side critical section. - */ - if (mlru) - kvfree_rcu(mlru, rcu); -} - static inline void memcg_init_list_lru(struct list_lru *lru, bool memcg_aware) { if (memcg_aware) @@ -393,22 +399,18 @@ static void memcg_destroy_list_lru(struct list_lru *lru) } static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, - int src_idx, struct mem_cgroup *dst_memcg) + struct list_lru_one *src, + struct mem_cgroup *dst_memcg) { struct list_lru_node *nlru = &lru->node[nid]; - int dst_idx = dst_memcg->kmemcg_id; - struct list_lru_one *src, *dst; + struct list_lru_one *dst; /* * Since list_lru_{add,del} may be called under an IRQ-safe lock, * we have to use IRQ-safe primitives here to avoid deadlock. */ spin_lock_irq(&nlru->lock); - - src = list_lru_from_memcg_idx(lru, nid, src_idx); - if (!src) - goto out; - dst = list_lru_from_memcg_idx(lru, nid, dst_idx); + dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg)); list_splice_init(&src->list, &dst->list); @@ -417,46 +419,43 @@ static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); src->nr_items = 0; } -out: spin_unlock_irq(&nlru->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) { - struct cgroup_subsys_state *css; struct list_lru *lru; - int src_idx = memcg->kmemcg_id, i; - - /* - * Change kmemcg_id of this cgroup and all its descendants to the - * parent's id, and then move all entries from this cgroup's list_lrus - * to ones of the parent. - */ - rcu_read_lock(); - css_for_each_descendant_pre(css, &memcg->css) { - struct mem_cgroup *child; - - child = mem_cgroup_from_css(css); - WRITE_ONCE(child->kmemcg_id, parent->kmemcg_id); - } - rcu_read_unlock(); + int i; - /* - * With kmemcg_id set to parent, holding the lock of each list_lru_node - * below can prevent list_lru_{add,del,isolate} from touching the lru, - * safe to reparent. - */ mutex_lock(&list_lrus_mutex); list_for_each_entry(lru, &memcg_list_lrus, list) { + struct list_lru_memcg *mlru; + XA_STATE(xas, &lru->xa, memcg->kmemcg_id); + + /* + * Lock the Xarray to ensure no on going list_lru_memcg + * allocation and further allocation will see css_is_dying(). 
+ */ + xas_lock_irq(&xas); + mlru = xas_store(&xas, NULL); + xas_unlock_irq(&xas); + if (!mlru) + continue; + + /* + * With Xarray value set to NULL, holding the lru lock below + * prevents list_lru_{add,del,isolate} from touching the lru, + * safe to reparent. + */ for_each_node(i) - memcg_reparent_list_lru_node(lru, i, src_idx, parent); + memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent); /* * Here all list_lrus corresponding to the cgroup are guaranteed * to remain empty, we can safely free this lru, any further * memcg_list_lru_alloc() call will simply bail out. */ - memcg_list_lru_free(lru, src_idx); + kvfree_rcu(mlru, rcu); } mutex_unlock(&list_lrus_mutex); } @@ -472,77 +471,48 @@ static inline bool memcg_list_lru_allocated(struct mem_cgroup *memcg, int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, gfp_t gfp) { - int i; unsigned long flags; - struct list_lru_memcg_table { - struct list_lru_memcg *mlru; - struct mem_cgroup *memcg; - } *table; + struct list_lru_memcg *mlru; + struct mem_cgroup *pos, *parent; XA_STATE(xas, &lru->xa, 0); if (!list_lru_memcg_aware(lru) || memcg_list_lru_allocated(memcg, lru)) return 0; gfp &= GFP_RECLAIM_MASK; - table = kmalloc_array(memcg->css.cgroup->level, sizeof(*table), gfp); - if (!table) - return -ENOMEM; - /* * Because the list_lru can be reparented to the parent cgroup's * list_lru, we should make sure that this cgroup and all its * ancestors have allocated list_lru_memcg. */ - for (i = 0; memcg; memcg = parent_mem_cgroup(memcg), i++) { - if (memcg_list_lru_allocated(memcg, lru)) - break; - - table[i].memcg = memcg; - table[i].mlru = memcg_init_list_lru_one(gfp); - if (!table[i].mlru) { - while (i--) - kfree(table[i].mlru); - kfree(table); - return -ENOMEM; + do { + /* + * Keep finding the farest parent that wasn't populated + * until found memcg itself. + */ + pos = memcg; + parent = parent_mem_cgroup(pos); + while (!memcg_list_lru_allocated(parent, lru)) { + pos = parent; + parent = parent_mem_cgroup(pos); } - } - - xas_lock_irqsave(&xas, flags); - while (i--) { - int index = READ_ONCE(table[i].memcg->kmemcg_id); - struct list_lru_memcg *mlru = table[i].mlru; - xas_set(&xas, index); -retry: - if (unlikely(index < 0 || xas_error(&xas) || xas_load(&xas))) { - kfree(mlru); - } else { - xas_store(&xas, mlru); - if (xas_error(&xas) == -ENOMEM) { - xas_unlock_irqrestore(&xas, flags); - if (xas_nomem(&xas, gfp)) - xas_set_err(&xas, 0); - xas_lock_irqsave(&xas, flags); - /* - * The xas lock has been released, this memcg - * can be reparented before us. So reload - * memcg id. More details see the comments - * in memcg_reparent_list_lrus(). - */ - index = READ_ONCE(table[i].memcg->kmemcg_id); - if (index < 0) - xas_set_err(&xas, 0); - else if (!xas_error(&xas) && index != xas.xa_index) - xas_set(&xas, index); - goto retry; + mlru = memcg_init_list_lru_one(gfp); + if (!mlru) + return -ENOMEM; + xas_set(&xas, pos->kmemcg_id); + do { + xas_lock_irqsave(&xas, flags); + if (!xas_load(&xas) && !css_is_dying(&pos->css)) { + xas_store(&xas, mlru); + if (!xas_error(&xas)) + mlru = NULL; } - } - } - /* xas_nomem() is used to free memory instead of memory allocation. 
*/ - if (xas.xa_alloc) - xas_nomem(&xas, gfp); - xas_unlock_irqrestore(&xas, flags); - kfree(table); + xas_unlock_irqrestore(&xas, flags); + } while (xas_nomem(&xas, gfp)); + if (mlru) + kfree(mlru); + } while (pos != memcg && !css_is_dying(&pos->css)); return xas_error(&xas); } diff --git a/mm/zswap.c b/mm/zswap.c index b68f80e1f8066..96aeb969d6d69 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -709,12 +709,11 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) /* * Note that it is safe to use rcu_read_lock() here, even in the face of - * concurrent memcg offlining. Thanks to the memcg->kmemcg_id indirection - * used in list_lru lookup, only two scenarios are possible: + * concurrent memcg offlining: * - * 1. list_lru_add() is called before memcg->kmemcg_id is updated. The + * 1. list_lru_add() is called before list_lru_memcg is erased. The * new entry will be reparented to memcg's parent's list_lru. - * 2. list_lru_add() is called after memcg->kmemcg_id is updated. The + * 2. list_lru_add() is called after list_lru_memcg is erased. The * new entry will be added directly to memcg's parent's list_lru. * * Similar reasoning holds for list_lru_del(). From fb56fdf8b9a2f7397f8a83dce50189f3f0cf71af Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 5 Nov 2024 01:52:56 +0800 Subject: [PATCH 203/215] mm/list_lru: split the lock to per-cgroup scope Currently, every list_lru has a per-node lock that protects adding, deletion, isolation, and reparenting of all list_lru_one instances belonging to this list_lru on this node. This lock contention is heavy when multiple cgroups modify the same list_lru. This lock can be split into per-cgroup scope to reduce contention. To achieve this, we need a stable list_lru_one for every cgroup. This commit adds a lock to each list_lru_one and introduces a helper function, lock_list_lru_of_memcg, making it possible to pin the list_lru of a memcg. The reparenting process is then reworked. Reparenting will switch the list_lru_one instances one by one. By locking each instance and marking it dead using the nr_items counter, reparenting ensures that all items in the corresponding cgroup (on-list or not, because items have a stable cgroup, see below) will see the list_lru_one switch synchronously. Objcg reparent is also moved after list_lru reparent so items will have a stable mem cgroup until all list_lru_one instances are drained. The only callers that don't use the *_obj interfaces are direct calls to list_lru_{add,del}. But these are only used by zswap, which is also based on objcg, so it's fine. This also changes the behaviour of the isolation function when LRU_RETRY or LRU_REMOVED_RETRY is returned: because releasing the lock can now unblock reparenting and free the list_lru_one, the isolation function has to return without re-locking the lru.
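To illustrate the new contract for walk callbacks, a hypothetical isolate callback could look like the sketch below. All foo_* helpers are invented for illustration; the spinlock_t * argument still exists at this point in the series (a later patch removes it) and now refers to the per-cgroup lru->lock:

static enum lru_status foo_lru_isolate(struct list_head *item,
				       struct list_lru_one *lru,
				       spinlock_t *lock, void *cb_arg)
{
	struct list_head *dispose = cb_arg;
	struct foo_object *obj = container_of(item, struct foo_object, lru);

	if (!foo_object_trylock(obj))
		return LRU_SKIP;

	if (foo_object_busy(obj)) {
		foo_object_unlock(obj);
		/* dropping the per-cgroup lru lock may let reparenting free it... */
		spin_unlock(lock);
		/* ...so return without re-taking it; the walker restarts itself */
		return LRU_RETRY;
	}

	/* move to a private dispose list while still holding the lru lock */
	list_lru_isolate_move(lru, item, dispose);
	foo_object_unlock(obj);
	return LRU_REMOVED;
}

The benchmark that follows exercises exactly this path: many cgroups reclaiming against the same list_lru under memory pressure.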
prepare() { mkdir /tmp/test-fs modprobe brd rd_nr=1 rd_size=33554432 mkfs.xfs -f /dev/ram0 mount -t xfs /dev/ram0 /tmp/test-fs for i in $(seq 1 512); do mkdir "/tmp/test-fs/$i" for j in $(seq 1 10240); do echo TEST-CONTENT > "/tmp/test-fs/$i/$j" done & done; wait } do_test() { read_worker() { sleep 1 tar -cv "$1" &>/dev/null } read_in_all() { cd "/tmp/test-fs" && ls for i in $(seq 1 512); do (exec sh -c 'echo "$PPID"') > "/sys/fs/cgroup/benchmark/$i/cgroup.procs" read_worker "$i" & done; wait } for i in $(seq 1 512); do mkdir -p "/sys/fs/cgroup/benchmark/$i" done echo +memory > /sys/fs/cgroup/benchmark/cgroup.subtree_control echo 512M > /sys/fs/cgroup/benchmark/memory.max echo 3 > /proc/sys/vm/drop_caches time read_in_all } Above script simulates compression of small files in multiple cgroups with memory pressure. Run prepare() then do_test for 6 times: Before: real 0m7.762s user 0m11.340s sys 3m11.224s real 0m8.123s user 0m11.548s sys 3m2.549s real 0m7.736s user 0m11.515s sys 3m11.171s real 0m8.539s user 0m11.508s sys 3m7.618s real 0m7.928s user 0m11.349s sys 3m13.063s real 0m8.105s user 0m11.128s sys 3m14.313s After this commit (about ~15% faster): real 0m6.953s user 0m11.327s sys 2m42.912s real 0m7.453s user 0m11.343s sys 2m51.942s real 0m6.916s user 0m11.269s sys 2m43.957s real 0m6.894s user 0m11.528s sys 2m45.346s real 0m6.911s user 0m11.095s sys 2m43.168s real 0m6.773s user 0m11.518s sys 2m40.774s Link: https://lkml.kernel.org/r/20241104175257.60853-6-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Signed-off-by: Andrew Morton --- drivers/android/binder_alloc.c | 1 - fs/inode.c | 1 - fs/xfs/xfs_qm.c | 1 - include/linux/list_lru.h | 6 +- mm/list_lru.c | 216 +++++++++++++++++++-------------- mm/memcontrol.c | 7 +- mm/workingset.c | 1 - mm/zswap.c | 5 +- 8 files changed, 135 insertions(+), 103 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index b3acbc4174fb1..86bbe40f4bcdb 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1106,7 +1106,6 @@ enum lru_status binder_alloc_free_page(struct list_head *item, mmput_async(mm); __free_page(page_to_free); - spin_lock(lock); return LRU_REMOVED_RETRY; err_invalid_vma: diff --git a/fs/inode.c b/fs/inode.c index 8dabb224f941c..442cb4fc09b25 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -934,7 +934,6 @@ static enum lru_status inode_lru_isolate(struct list_head *item, mm_account_reclaimed_pages(reap); } inode_unpin_lru_isolating(inode); - spin_lock(lru_lock); return LRU_RETRY; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 7e2307921deb2..665d26990b789 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -496,7 +496,6 @@ xfs_qm_dquot_isolate( trace_xfs_dqreclaim_busy(dqp); XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses); xfs_dqunlock(dqp); - spin_lock(lru_lock); return LRU_RETRY; } diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index eba93f6511f31..10ba9a54d42c1 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -32,6 +32,8 @@ struct list_lru_one { struct list_head list; /* may become negative during memcg reparenting */ long nr_items; + /* protects all fields above */ + spinlock_t lock; }; struct list_lru_memcg { @@ -41,11 +43,9 @@ struct list_lru_memcg { }; struct list_lru_node { - /* protects all lists on the node, including per cgroup */ - spinlock_t lock; /* global 
list, used for the root cgroup in cgroup aware lrus */ struct list_lru_one lru; - long nr_items; + atomic_long_t nr_items; } ____cacheline_aligned_in_smp; struct list_lru { diff --git a/mm/list_lru.c b/mm/list_lru.c index 172b16146e157..c139202e27f7d 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -61,18 +61,51 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } static inline struct list_lru_one * -list_lru_from_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg) +lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + bool irq, bool skip_empty) { struct list_lru_one *l; + long nr_items; + + rcu_read_lock(); again: l = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(memcg)); - if (likely(l)) - return l; - - memcg = parent_mem_cgroup(memcg); + if (likely(l)) { + if (irq) + spin_lock_irq(&l->lock); + else + spin_lock(&l->lock); + nr_items = READ_ONCE(l->nr_items); + if (likely(nr_items != LONG_MIN)) { + WARN_ON(nr_items < 0); + rcu_read_unlock(); + return l; + } + if (irq) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); + } + /* + * Caller may simply bail out if raced with reparenting or + * may iterate through the list_lru and expect empty slots. + */ + if (skip_empty) { + rcu_read_unlock(); + return NULL; + } VM_WARN_ON(!css_is_dying(&memcg->css)); + memcg = parent_mem_cgroup(memcg); goto again; } + +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) +{ + if (irq_off) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); +} #else static void list_lru_register(struct list_lru *lru) { @@ -99,31 +132,48 @@ list_lru_from_memcg_idx(struct list_lru *lru, int nid, int idx) } static inline struct list_lru_one * -list_lru_from_memcg(struct list_lru *lru, int nid, int idx) +lock_list_lru_of_memcg(struct list_lru *lru, int nid, struct mem_cgroup *memcg, + bool irq, bool skip_empty) { - return &lru->node[nid].lru; + struct list_lru_one *l = &lru->node[nid].lru; + + if (irq) + spin_lock_irq(&l->lock); + else + spin_lock(&l->lock); + + return l; +} + +static inline void unlock_list_lru(struct list_lru_one *l, bool irq_off) +{ + if (irq_off) + spin_unlock_irq(&l->lock); + else + spin_unlock(&l->lock); } #endif /* CONFIG_MEMCG */ /* The caller must ensure the memcg lifetime. */ bool list_lru_add(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - spin_lock(&nlru->lock); + l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); + if (!l) + return false; if (list_empty(item)) { - l = list_lru_from_memcg(lru, nid, memcg); list_add_tail(item, &l->list); /* Set shrinker bit if the first element was added */ if (!l->nr_items++) set_shrinker_bit(memcg, nid, lru_shrinker_id(lru)); - nlru->nr_items++; - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); + atomic_long_inc(&nlru->nr_items); return true; } - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); return false; } @@ -146,24 +196,23 @@ EXPORT_SYMBOL_GPL(list_lru_add_obj); /* The caller must ensure the memcg lifetime. 
*/ bool list_lru_del(struct list_lru *lru, struct list_head *item, int nid, - struct mem_cgroup *memcg) + struct mem_cgroup *memcg) { struct list_lru_node *nlru = &lru->node[nid]; struct list_lru_one *l; - - spin_lock(&nlru->lock); + l = lock_list_lru_of_memcg(lru, nid, memcg, false, false); + if (!l) + return false; if (!list_empty(item)) { - l = list_lru_from_memcg(lru, nid, memcg); list_del_init(item); l->nr_items--; - nlru->nr_items--; - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); + atomic_long_dec(&nlru->nr_items); return true; } - spin_unlock(&nlru->lock); + unlock_list_lru(l, false); return false; } -EXPORT_SYMBOL_GPL(list_lru_del); bool list_lru_del_obj(struct list_lru *lru, struct list_head *item) { @@ -220,25 +269,24 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid) struct list_lru_node *nlru; nlru = &lru->node[nid]; - return nlru->nr_items; + return atomic_long_read(&nlru->nr_items); } EXPORT_SYMBOL_GPL(list_lru_count_node); static unsigned long -__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, +__list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, - unsigned long *nr_to_walk) + unsigned long *nr_to_walk, bool irq_off) { struct list_lru_node *nlru = &lru->node[nid]; - struct list_lru_one *l; + struct list_lru_one *l = NULL; struct list_head *item, *n; unsigned long isolated = 0; restart: - l = list_lru_from_memcg_idx(lru, nid, memcg_idx); + l = lock_list_lru_of_memcg(lru, nid, memcg, irq_off, true); if (!l) - goto out; - + return isolated; list_for_each_safe(item, n, &l->list) { enum lru_status ret; @@ -250,19 +298,19 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, break; --*nr_to_walk; - ret = isolate(item, l, &nlru->lock, cb_arg); + ret = isolate(item, l, &l->lock, cb_arg); switch (ret) { + /* + * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru + * lock. List traversal will have to restart from scratch. + */ + case LRU_RETRY: + goto restart; case LRU_REMOVED_RETRY: - assert_spin_locked(&nlru->lock); fallthrough; case LRU_REMOVED: isolated++; - nlru->nr_items--; - /* - * If the lru lock has been dropped, our list - * traversal is now invalid and so we have to - * restart from scratch. - */ + atomic_long_dec(&nlru->nr_items); if (ret == LRU_REMOVED_RETRY) goto restart; break; @@ -271,20 +319,13 @@ __list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx, break; case LRU_SKIP: break; - case LRU_RETRY: - /* - * The lru lock has been dropped, our list traversal is - * now invalid and so we have to restart from scratch. 
- */ - assert_spin_locked(&nlru->lock); - goto restart; case LRU_STOP: - assert_spin_locked(&nlru->lock); goto out; default: BUG(); } } + unlock_list_lru(l, irq_off); out: return isolated; } @@ -294,14 +335,8 @@ list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; - unsigned long ret; - - spin_lock(&nlru->lock); - ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, - cb_arg, nr_to_walk); - spin_unlock(&nlru->lock); - return ret; + return __list_lru_walk_one(lru, nid, memcg, isolate, + cb_arg, nr_to_walk, false); } EXPORT_SYMBOL_GPL(list_lru_walk_one); @@ -310,14 +345,8 @@ list_lru_walk_one_irq(struct list_lru *lru, int nid, struct mem_cgroup *memcg, list_lru_walk_cb isolate, void *cb_arg, unsigned long *nr_to_walk) { - struct list_lru_node *nlru = &lru->node[nid]; - unsigned long ret; - - spin_lock_irq(&nlru->lock); - ret = __list_lru_walk_one(lru, nid, memcg_kmem_id(memcg), isolate, - cb_arg, nr_to_walk); - spin_unlock_irq(&nlru->lock); - return ret; + return __list_lru_walk_one(lru, nid, memcg, isolate, + cb_arg, nr_to_walk, true); } unsigned long list_lru_walk_node(struct list_lru *lru, int nid, @@ -332,16 +361,21 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, #ifdef CONFIG_MEMCG if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) { struct list_lru_memcg *mlru; + struct mem_cgroup *memcg; unsigned long index; xa_for_each(&lru->xa, index, mlru) { - struct list_lru_node *nlru = &lru->node[nid]; - - spin_lock(&nlru->lock); - isolated += __list_lru_walk_one(lru, nid, index, + rcu_read_lock(); + memcg = mem_cgroup_from_id(index); + if (!mem_cgroup_tryget(memcg)) { + rcu_read_unlock(); + continue; + } + rcu_read_unlock(); + isolated += __list_lru_walk_one(lru, nid, memcg, isolate, cb_arg, - nr_to_walk); - spin_unlock(&nlru->lock); + nr_to_walk, false); + mem_cgroup_put(memcg); if (*nr_to_walk <= 0) break; @@ -353,14 +387,19 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid, } EXPORT_SYMBOL_GPL(list_lru_walk_node); -static void init_one_lru(struct list_lru_one *l) +static void init_one_lru(struct list_lru *lru, struct list_lru_one *l) { INIT_LIST_HEAD(&l->list); + spin_lock_init(&l->lock); l->nr_items = 0; +#ifdef CONFIG_LOCKDEP + if (lru->key) + lockdep_set_class(&l->lock, lru->key); +#endif } #ifdef CONFIG_MEMCG -static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) +static struct list_lru_memcg *memcg_init_list_lru_one(struct list_lru *lru, gfp_t gfp) { int nid; struct list_lru_memcg *mlru; @@ -370,7 +409,7 @@ static struct list_lru_memcg *memcg_init_list_lru_one(gfp_t gfp) return NULL; for_each_node(nid) - init_one_lru(&mlru->node[nid]); + init_one_lru(lru, &mlru->node[nid]); return mlru; } @@ -398,28 +437,27 @@ static void memcg_destroy_list_lru(struct list_lru *lru) xas_unlock_irq(&xas); } -static void memcg_reparent_list_lru_node(struct list_lru *lru, int nid, - struct list_lru_one *src, - struct mem_cgroup *dst_memcg) +static void memcg_reparent_list_lru_one(struct list_lru *lru, int nid, + struct list_lru_one *src, + struct mem_cgroup *dst_memcg) { - struct list_lru_node *nlru = &lru->node[nid]; + int dst_idx = dst_memcg->kmemcg_id; struct list_lru_one *dst; - /* - * Since list_lru_{add,del} may be called under an IRQ-safe lock, - * we have to use IRQ-safe primitives here to avoid deadlock. 
- */ - spin_lock_irq(&nlru->lock); - dst = list_lru_from_memcg_idx(lru, nid, memcg_kmem_id(dst_memcg)); + spin_lock_irq(&src->lock); + dst = list_lru_from_memcg_idx(lru, nid, dst_idx); + spin_lock_nested(&dst->lock, SINGLE_DEPTH_NESTING); list_splice_init(&src->list, &dst->list); - if (src->nr_items) { dst->nr_items += src->nr_items; set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru)); - src->nr_items = 0; } - spin_unlock_irq(&nlru->lock); + /* Mark the list_lru_one dead */ + src->nr_items = LONG_MIN; + + spin_unlock(&dst->lock); + spin_unlock_irq(&src->lock); } void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *parent) @@ -448,7 +486,7 @@ void memcg_reparent_list_lrus(struct mem_cgroup *memcg, struct mem_cgroup *paren * safe to reparent. */ for_each_node(i) - memcg_reparent_list_lru_node(lru, i, &mlru->node[i], parent); + memcg_reparent_list_lru_one(lru, i, &mlru->node[i], parent); /* * Here all list_lrus corresponding to the cgroup are guaranteed @@ -497,7 +535,7 @@ int memcg_list_lru_alloc(struct mem_cgroup *memcg, struct list_lru *lru, parent = parent_mem_cgroup(pos); } - mlru = memcg_init_list_lru_one(gfp); + mlru = memcg_init_list_lru_one(lru, gfp); if (!mlru) return -ENOMEM; xas_set(&xas, pos->kmemcg_id); @@ -544,14 +582,8 @@ int __list_lru_init(struct list_lru *lru, bool memcg_aware, struct shrinker *shr if (!lru->node) return -ENOMEM; - for_each_node(i) { - spin_lock_init(&lru->node[i].lock); -#ifdef CONFIG_LOCKDEP - if (lru->key) - lockdep_set_class(&lru->node[i].lock, lru->key); -#endif - init_one_lru(&lru->node[i].lru); - } + for_each_node(i) + init_one_lru(lru, &lru->node[i].lru); memcg_init_list_lru(lru, memcg_aware); list_lru_register(lru); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8b9061b58a728..ed2dd88437cab 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3110,8 +3110,13 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) if (!parent) parent = root_mem_cgroup; - memcg_reparent_objcgs(memcg, parent); memcg_reparent_list_lrus(memcg, parent); + + /* + * Objcg's reparenting must be after list_lru's, make sure list_lru + * helpers won't use parent's list_lru until child is drained. + */ + memcg_reparent_objcgs(memcg, parent); } #ifdef CONFIG_CGROUP_WRITEBACK diff --git a/mm/workingset.c b/mm/workingset.c index 5c8861edbf171..c187d4a3fbeae 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -767,7 +767,6 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, ret = LRU_REMOVED_RETRY; out: cond_resched(); - spin_lock_irq(lru_lock); return ret; } diff --git a/mm/zswap.c b/mm/zswap.c index 96aeb969d6d69..ba35e45509414 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -711,9 +711,9 @@ static void zswap_lru_add(struct list_lru *list_lru, struct zswap_entry *entry) * Note that it is safe to use rcu_read_lock() here, even in the face of * concurrent memcg offlining: * - * 1. list_lru_add() is called before list_lru_memcg is erased. The + * 1. list_lru_add() is called before list_lru_one is dead. The * new entry will be reparented to memcg's parent's list_lru. - * 2. list_lru_add() is called after list_lru_memcg is erased. The + * 2. list_lru_add() is called after list_lru_one is dead. The * new entry will be added directly to memcg's parent's list_lru. * * Similar reasoning holds for list_lru_del(). 
@@ -1179,7 +1179,6 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o zswap_written_back_pages++; } - spin_lock(lock); return ret; } From da0c02516c501b43bd39ad4aca5779c86153d143 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 5 Nov 2024 01:52:57 +0800 Subject: [PATCH 204/215] mm/list_lru: simplify the list_lru walk callback function Now isolation no longer takes the list_lru global node lock, only use the per-cgroup lock instead. And this lock is inside the list_lru_one being walked, no longer needed to pass the lock explicitly. Link: https://lkml.kernel.org/r/20241104175257.60853-7-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Chengming Zhou Cc: Johannes Weiner Cc: Matthew Wilcox (Oracle) Cc: Michal Hocko Cc: Muchun Song Cc: Qi Zheng Cc: Roman Gushchin Cc: Shakeel Butt Cc: Waiman Long Signed-off-by: Andrew Morton --- drivers/android/binder_alloc.c | 7 +++---- drivers/android/binder_alloc.h | 2 +- fs/dcache.c | 4 ++-- fs/gfs2/quota.c | 2 +- fs/inode.c | 4 ++-- fs/nfs/nfs42xattr.c | 4 ++-- fs/nfsd/filecache.c | 5 +---- fs/xfs/xfs_buf.c | 2 -- fs/xfs/xfs_qm.c | 5 ++--- include/linux/list_lru.h | 2 +- mm/list_lru.c | 2 +- mm/workingset.c | 15 +++++++-------- mm/zswap.c | 4 ++-- 13 files changed, 25 insertions(+), 33 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 86bbe40f4bcdb..a738e77458658 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -1047,7 +1047,7 @@ void binder_alloc_vma_close(struct binder_alloc *alloc) /** * binder_alloc_free_page() - shrinker callback to free pages * @item: item to free - * @lock: lock protecting the item + * @lru: list_lru instance of the item * @cb_arg: callback argument * * Called from list_lru_walk() in binder_shrink_scan() to free @@ -1055,9 +1055,8 @@ void binder_alloc_vma_close(struct binder_alloc *alloc) */ enum lru_status binder_alloc_free_page(struct list_head *item, struct list_lru_one *lru, - spinlock_t *lock, void *cb_arg) - __must_hold(lock) + __must_hold(&lru->lock) { struct binder_lru_page *page = container_of(item, typeof(*page), lru); struct binder_alloc *alloc = page->alloc; @@ -1092,7 +1091,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, list_lru_isolate(lru, item); spin_unlock(&alloc->lock); - spin_unlock(lock); + spin_unlock(&lru->lock); if (vma) { trace_binder_unmap_user_start(alloc, index); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 70387234477e0..c02c8ebcb466b 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -118,7 +118,7 @@ static inline void binder_selftest_alloc(struct binder_alloc *alloc) {} #endif enum lru_status binder_alloc_free_page(struct list_head *item, struct list_lru_one *lru, - spinlock_t *lock, void *cb_arg); + void *cb_arg); struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, size_t data_size, size_t offsets_size, diff --git a/fs/dcache.c b/fs/dcache.c index 0f6b16ba30d08..d7f6866f5f523 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1089,7 +1089,7 @@ void shrink_dentry_list(struct list_head *list) } static enum lru_status dentry_lru_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); @@ -1170,7 +1170,7 @@ long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc) } static enum lru_status 
dentry_lru_isolate_shrink(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct dentry *dentry = container_of(item, struct dentry, d_lru); diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 2e6bc77f4f81c..72b48f6f55617 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -149,7 +149,7 @@ static void gfs2_qd_list_dispose(struct list_head *list) static enum lru_status gfs2_qd_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *dispose = arg; struct gfs2_quota_data *qd = diff --git a/fs/inode.c b/fs/inode.c index 442cb4fc09b25..46fbd5b234822 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -881,7 +881,7 @@ void invalidate_inodes(struct super_block *sb) * with this flag set because they are the inodes that are out of order. */ static enum lru_status inode_lru_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *freeable = arg; struct inode *inode = container_of(item, struct inode, i_lru); @@ -923,7 +923,7 @@ static enum lru_status inode_lru_isolate(struct list_head *item, if (inode_has_buffers(inode) || !mapping_empty(&inode->i_data)) { inode_pin_lru_isolating(inode); spin_unlock(&inode->i_lock); - spin_unlock(lru_lock); + spin_unlock(&lru->lock); if (remove_inode_buffers(inode)) { unsigned long reap; reap = invalidate_mapping_pages(&inode->i_data, 0, -1); diff --git a/fs/nfs/nfs42xattr.c b/fs/nfs/nfs42xattr.c index b6e3d8f77b910..37d79400e5f42 100644 --- a/fs/nfs/nfs42xattr.c +++ b/fs/nfs/nfs42xattr.c @@ -802,7 +802,7 @@ static struct shrinker *nfs4_xattr_large_entry_shrinker; static enum lru_status cache_lru_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *dispose = arg; struct inode *inode; @@ -867,7 +867,7 @@ nfs4_xattr_cache_count(struct shrinker *shrink, struct shrink_control *sc) static enum lru_status entry_lru_isolate(struct list_head *item, - struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) + struct list_lru_one *lru, void *arg) { struct list_head *dispose = arg; struct nfs4_xattr_bucket *bucket; diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 2e6783f637124..09c444eb944fe 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -487,7 +487,6 @@ void nfsd_file_net_dispose(struct nfsd_net *nn) * nfsd_file_lru_cb - Examine an entry on the LRU list * @item: LRU entry to examine * @lru: controlling LRU - * @lock: LRU list lock (unused) * @arg: dispose list * * Return values: @@ -497,9 +496,7 @@ void nfsd_file_net_dispose(struct nfsd_net *nn) */ static enum lru_status nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, - spinlock_t *lock, void *arg) - __releases(lock) - __acquires(lock) + void *arg) { struct list_head *head = arg; struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index aa4dbda7b5365..43b914c1f6212 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1857,7 +1857,6 @@ static enum lru_status xfs_buftarg_drain_rele( struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, void *arg) { @@ -1956,7 +1955,6 @@ static enum lru_status xfs_buftarg_isolate( struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, void *arg) { struct 
xfs_buf *bp = container_of(item, struct xfs_buf, b_lru); diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 665d26990b789..8413ac368042a 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -412,9 +412,8 @@ static enum lru_status xfs_qm_dquot_isolate( struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, void *arg) - __releases(lru_lock) __acquires(lru_lock) + __releases(&lru->lock) __acquires(&lru->lock) { struct xfs_dquot *dqp = container_of(item, struct xfs_dquot, q_lru); @@ -460,7 +459,7 @@ xfs_qm_dquot_isolate( trace_xfs_dqreclaim_dirty(dqp); /* we have to drop the LRU lock to flush the dquot */ - spin_unlock(lru_lock); + spin_unlock(&lru->lock); error = xfs_qm_dqflush(dqp, &bp); if (error) diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index 10ba9a54d42c1..05c166811f6ba 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -184,7 +184,7 @@ void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item, struct list_head *head); typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item, - struct list_lru_one *list, spinlock_t *lock, void *cb_arg); + struct list_lru_one *list, void *cb_arg); /** * list_lru_walk_one: walk a @lru, isolating and disposing freeable items. diff --git a/mm/list_lru.c b/mm/list_lru.c index c139202e27f7d..f93ada6a207b1 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -298,7 +298,7 @@ __list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg, break; --*nr_to_walk; - ret = isolate(item, l, &l->lock, cb_arg); + ret = isolate(item, l, cb_arg); switch (ret) { /* * LRU_RETRY, LRU_REMOVED_RETRY and LRU_STOP will drop the lru diff --git a/mm/workingset.c b/mm/workingset.c index c187d4a3fbeae..a4705e196545e 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -702,8 +702,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker, static enum lru_status shadow_lru_isolate(struct list_head *item, struct list_lru_one *lru, - spinlock_t *lru_lock, - void *arg) __must_hold(lru_lock) + void *arg) __must_hold(lru->lock) { struct xa_node *node = container_of(item, struct xa_node, private_list); struct address_space *mapping; @@ -712,20 +711,20 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, /* * Page cache insertions and deletions synchronously maintain * the shadow node LRU under the i_pages lock and the - * lru_lock. Because the page cache tree is emptied before - * the inode can be destroyed, holding the lru_lock pins any + * &lru->lock. Because the page cache tree is emptied before + * the inode can be destroyed, holding the &lru->lock pins any * address_space that has nodes on the LRU. * * We can then safely transition to the i_pages lock to * pin only the address_space of the particular node we want - * to reclaim, take the node off-LRU, and drop the lru_lock. + * to reclaim, take the node off-LRU, and drop the &lru->lock. 
*/ mapping = container_of(node->array, struct address_space, i_pages); /* Coming from the list, invert the lock order */ if (!xa_trylock(&mapping->i_pages)) { - spin_unlock_irq(lru_lock); + spin_unlock_irq(&lru->lock); ret = LRU_RETRY; goto out; } @@ -734,7 +733,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, if (mapping->host != NULL) { if (!spin_trylock(&mapping->host->i_lock)) { xa_unlock(&mapping->i_pages); - spin_unlock_irq(lru_lock); + spin_unlock_irq(&lru->lock); ret = LRU_RETRY; goto out; } @@ -743,7 +742,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, list_lru_isolate(lru, item); __dec_node_page_state(virt_to_page(node), WORKINGSET_NODES); - spin_unlock(lru_lock); + spin_unlock(&lru->lock); /* * The nodes should only contain one or more shadow entries, diff --git a/mm/zswap.c b/mm/zswap.c index ba35e45509414..f6316b66fb236 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1102,7 +1102,7 @@ static int zswap_writeback_entry(struct zswap_entry *entry, * for reclaim by this ratio. */ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_one *l, - spinlock_t *lock, void *arg) + void *arg) { struct zswap_entry *entry = container_of(item, struct zswap_entry, lru); bool *encountered_page_in_swapcache = (bool *)arg; @@ -1158,7 +1158,7 @@ static enum lru_status shrink_memcg_cb(struct list_head *item, struct list_lru_o * It's safe to drop the lock here because we return either * LRU_REMOVED_RETRY or LRU_RETRY. */ - spin_unlock(lock); + spin_unlock(&l->lock); writeback_result = zswap_writeback_entry(entry, swpentry); From 7591c127f3b17d5879f18819cad7058bf3a2e276 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Mon, 4 Nov 2024 11:19:44 +0000 Subject: [PATCH 205/215] kmemleak: iommu/iova: fix transient kmemleak false positive The introduction of iova_depot_pop() in 911aa1245da8 ("iommu/iova: Make the rcache depot scale better") confused kmemleak by moving a struct iova_magazine object from a singly linked list to rcache->depot and resetting the 'next' pointer referencing it. Unlike doubly linked lists, the content of the object being referred is never changed on removal from a singly linked list and the kmemleak checksum heuristics do not detect such scenario. This leads to false positives like: unreferenced object 0xffff8881a5301000 (size 1024): comm "softirq", pid 0, jiffies 4306297099 (age 462.991s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 e7 7d 05 00 00 00 00 00 .........}...... 0f b4 05 00 00 00 00 00 b4 96 05 00 00 00 00 00 ................ backtrace: [] __kmem_cache_alloc_node+0x1e8/0x320 [] kmalloc_trace+0x2a/0x60 [] free_iova_fast+0x28e/0x4e0 [] fq_ring_free_locked+0x1b0/0x310 [] fq_flush_timeout+0x19d/0x2e0 [] call_timer_fn+0x19a/0x5c0 [] __run_timers+0x78b/0xb80 [] run_timer_softirq+0x5d/0xd0 [] __do_softirq+0x205/0x8b5 Introduce kmemleak_transient_leak() which resets the object checksum requiring another scan pass before it is reported (if still unreferenced). Call this new API in iova_depot_pop(). 
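As a minimal sketch of the intended usage (hypothetical struct and function names, not the iova code itself): when popping a node from a singly linked list relocates the pointer that referenced the next node while leaving that node's bytes untouched, the pop helper can mark the node with kmemleak_transient_leak() just before the old reference is overwritten, which is what iova_depot_pop() does in the hunk below.

#include <linux/kmemleak.h>

struct depot_node {
	struct depot_node *next;
	/* payload ... */
};

static struct depot_node *depot_pop(struct depot_node **head)
{
	struct depot_node *node = *head;

	if (!node)
		return NULL;

	/*
	 * The reference to node->next is about to move from node->next to
	 * *head while the referred object stays byte-for-byte identical,
	 * which is the pattern the checksum heuristic can misreport.
	 * Reset its checksum so it gets one more scan pass.
	 */
	if (node->next)
		kmemleak_transient_leak(node->next);
	*head = node->next;
	node->next = NULL;
	return node;
}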
Link: https://lkml.kernel.org/r/20241104111944.2207155-1-catalin.marinas@arm.com Link: https://lore.kernel.org/r/ZY1osaGLyT-sdKE8@shredder/ Signed-off-by: Catalin Marinas Reported-by: Ido Schimmel Tested-by: Ido Schimmel Acked-by: Robin Murphy Cc: Joerg Roedel Cc: Will Deacon Signed-off-by: Andrew Morton --- Documentation/dev-tools/kmemleak.rst | 1 + drivers/iommu/iova.c | 6 +++++ include/linux/kmemleak.h | 4 +++ mm/kmemleak.c | 39 ++++++++++++++++++++++++++++ 4 files changed, 50 insertions(+) diff --git a/Documentation/dev-tools/kmemleak.rst b/Documentation/dev-tools/kmemleak.rst index 2cb00b53339fe..7d784e03f3f9d 100644 --- a/Documentation/dev-tools/kmemleak.rst +++ b/Documentation/dev-tools/kmemleak.rst @@ -161,6 +161,7 @@ See the include/linux/kmemleak.h header for the functions prototype. - ``kmemleak_free_percpu`` - notify of a percpu memory block freeing - ``kmemleak_update_trace`` - update object allocation stack trace - ``kmemleak_not_leak`` - mark an object as not a leak +- ``kmemleak_transient_leak`` - mark an object as a transient leak - ``kmemleak_ignore`` - do not scan or report an object as leak - ``kmemleak_scan_area`` - add scan areas inside a memory block - ``kmemleak_no_scan`` - do not scan a memory block diff --git a/drivers/iommu/iova.c b/drivers/iommu/iova.c index 16c6adff3eb7b..5b5400efb6577 100644 --- a/drivers/iommu/iova.c +++ b/drivers/iommu/iova.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -673,6 +674,11 @@ static struct iova_magazine *iova_depot_pop(struct iova_rcache *rcache) { struct iova_magazine *mag = rcache->depot; + /* + * As the mag->next pointer is moved to rcache->depot and reset via + * the mag->size assignment, mark it as a transient false positive. + */ + kmemleak_transient_leak(mag->next); rcache->depot = mag->next; mag->size = IOVA_MAG_SIZE; rcache->depot_size--; diff --git a/include/linux/kmemleak.h b/include/linux/kmemleak.h index 6a3cd1bf4680b..93a73c076d169 100644 --- a/include/linux/kmemleak.h +++ b/include/linux/kmemleak.h @@ -26,6 +26,7 @@ extern void kmemleak_free_part(const void *ptr, size_t size) __ref; extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; extern void kmemleak_update_trace(const void *ptr) __ref; extern void kmemleak_not_leak(const void *ptr) __ref; +extern void kmemleak_transient_leak(const void *ptr) __ref; extern void kmemleak_ignore(const void *ptr) __ref; extern void kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) __ref; extern void kmemleak_no_scan(const void *ptr) __ref; @@ -93,6 +94,9 @@ static inline void kmemleak_update_trace(const void *ptr) static inline void kmemleak_not_leak(const void *ptr) { } +static inline void kmemleak_transient_leak(const void *ptr) +{ +} static inline void kmemleak_ignore(const void *ptr) { } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 17006d8a2afae..2a945c07ae995 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -934,6 +934,28 @@ static void make_black_object(unsigned long ptr, unsigned int objflags) paint_ptr(ptr, KMEMLEAK_BLACK, objflags); } +/* + * Reset the checksum of an object. The immediate effect is that it will not + * be reported as a leak during the next scan until its checksum is updated. 
+ */ +static void reset_checksum(unsigned long ptr) +{ + unsigned long flags; + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Not resetting the checksum of an unknown object at 0x%08lx\n", + ptr); + return; + } + + raw_spin_lock_irqsave(&object->lock, flags); + object->checksum = 0; + raw_spin_unlock_irqrestore(&object->lock, flags); + put_object(object); +} + /* * Add a scanning area to the object. If at least one such area is added, * kmemleak will only scan these ranges rather than the whole memory block. @@ -1202,6 +1224,23 @@ void __ref kmemleak_not_leak(const void *ptr) } EXPORT_SYMBOL(kmemleak_not_leak); +/** + * kmemleak_transient_leak - mark an allocated object as transient false positive + * @ptr: pointer to beginning of the object + * + * Calling this function on an object will cause the memory block to not be + * reported as a leak temporarily. This may happen, for example, if the object + * is part of a singly linked list and the ->next reference to it is changed. + */ +void __ref kmemleak_transient_leak(const void *ptr) +{ + pr_debug("%s(0x%px)\n", __func__, ptr); + + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + reset_checksum((unsigned long)ptr); +} +EXPORT_SYMBOL(kmemleak_transient_leak); + /** * kmemleak_ignore - ignore an allocated object * @ptr: pointer to beginning of the object From 7269ed4af344184ab9bdf318fe8864cf64849735 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Mon, 4 Nov 2024 15:07:12 +0800 Subject: [PATCH 206/215] mm: define general function pXd_init() pud_init(), pmd_init() and kernel_pte_init() are duplicated defined in file kasan.c and sparse-vmemmap.c as weak functions. Move them to generic header file pgtable.h, architecture can redefine them. Link: https://lkml.kernel.org/r/20241104070712.52902-1-maobibo@loongson.cn Signed-off-by: Bibo Mao Reviewed-by: Huacai Chen Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Thomas Bogendoerfer Cc: Vincenzo Frascino Cc: WANG Xuerui Signed-off-by: Andrew Morton --- arch/loongarch/include/asm/pgtable.h | 3 +++ arch/mips/include/asm/pgtable-64.h | 2 ++ include/linux/mm.h | 3 --- include/linux/pgtable.h | 21 +++++++++++++++++++++ mm/kasan/init.c | 12 ------------ mm/sparse-vmemmap.c | 12 ------------ 6 files changed, 26 insertions(+), 27 deletions(-) diff --git a/arch/loongarch/include/asm/pgtable.h b/arch/loongarch/include/asm/pgtable.h index 20714b73f14c8..da346733a1dae 100644 --- a/arch/loongarch/include/asm/pgtable.h +++ b/arch/loongarch/include/asm/pgtable.h @@ -268,8 +268,11 @@ extern void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pm */ extern void pgd_init(void *addr); extern void pud_init(void *addr); +#define pud_init pud_init extern void pmd_init(void *addr); +#define pmd_init pmd_init extern void kernel_pte_init(void *addr); +#define kernel_pte_init kernel_pte_init /* * Encode/decode swap entries and swap PTEs. Swap PTEs are all PTEs that diff --git a/arch/mips/include/asm/pgtable-64.h b/arch/mips/include/asm/pgtable-64.h index 401c1d9e4409a..6e854bb11f37d 100644 --- a/arch/mips/include/asm/pgtable-64.h +++ b/arch/mips/include/asm/pgtable-64.h @@ -317,7 +317,9 @@ static inline pmd_t *pud_pgtable(pud_t pud) */ extern void pgd_init(void *addr); extern void pud_init(void *addr); +#define pud_init pud_init extern void pmd_init(void *addr); +#define pmd_init pmd_init /* * Encode/decode swap entries and swap PTEs. 
Swap PTEs are all PTEs that diff --git a/include/linux/mm.h b/include/linux/mm.h index 32888a97ab44e..5d6cd523c7c09 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3819,9 +3819,6 @@ void *sparse_buffer_alloc(unsigned long size); struct page * __populate_section_memmap(unsigned long pfn, unsigned long nr_pages, int nid, struct vmem_altmap *altmap, struct dev_pagemap *pgmap); -void pud_init(void *addr); -void pmd_init(void *addr); -void kernel_pte_init(void *addr); pgd_t *vmemmap_pgd_populate(unsigned long addr, int node); p4d_t *vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node); pud_t *vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node); diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index 23aeffd89a4e0..adef9d6e9b1ba 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -90,6 +90,27 @@ static inline unsigned long pud_index(unsigned long address) #define pgd_index(a) (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1)) #endif +#ifndef kernel_pte_init +static inline void kernel_pte_init(void *addr) +{ +} +#define kernel_pte_init kernel_pte_init +#endif + +#ifndef pmd_init +static inline void pmd_init(void *addr) +{ +} +#define pmd_init pmd_init +#endif + +#ifndef pud_init +static inline void pud_init(void *addr) +{ +} +#define pud_init pud_init +#endif + #ifndef pte_offset_kernel static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address) { diff --git a/mm/kasan/init.c b/mm/kasan/init.c index ac607c306292f..ced6b29fcf763 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -106,10 +106,6 @@ static void __ref zero_pte_populate(pmd_t *pmd, unsigned long addr, } } -void __weak __meminit kernel_pte_init(void *addr) -{ -} - static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, unsigned long end) { @@ -145,10 +141,6 @@ static int __ref zero_pmd_populate(pud_t *pud, unsigned long addr, return 0; } -void __weak __meminit pmd_init(void *addr) -{ -} - static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long end) { @@ -187,10 +179,6 @@ static int __ref zero_pud_populate(p4d_t *p4d, unsigned long addr, return 0; } -void __weak __meminit pud_init(void *addr) -{ -} - static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, unsigned long end) { diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index c0388b2e959da..cec67c5f37d88 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -184,10 +184,6 @@ static void * __meminit vmemmap_alloc_block_zero(unsigned long size, int node) return p; } -void __weak __meminit kernel_pte_init(void *addr) -{ -} - pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) { pmd_t *pmd = pmd_offset(pud, addr); @@ -201,10 +197,6 @@ pmd_t * __meminit vmemmap_pmd_populate(pud_t *pud, unsigned long addr, int node) return pmd; } -void __weak __meminit pmd_init(void *addr) -{ -} - pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) { pud_t *pud = pud_offset(p4d, addr); @@ -218,10 +210,6 @@ pud_t * __meminit vmemmap_pud_populate(p4d_t *p4d, unsigned long addr, int node) return pud; } -void __weak __meminit pud_init(void *addr) -{ -} - p4d_t * __meminit vmemmap_p4d_populate(pgd_t *pgd, unsigned long addr, int node) { p4d_t *p4d = p4d_offset(pgd, addr); From e19175909180cf0affb9d8649cb234fbd91070b0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 1 Nov 2024 13:35:57 -0700 Subject: [PATCH 207/215] Docs/mm/damon: recommend academic papers to read and/or cite Kernel documentation is the most 
up-to-date and recommended resource for DAMON. It doesn't cover non-kernel part of the entire project[1], though. Also it is not optimum for formal long-term citations. Depending on cases, DAMON academic papers[2,3] could be better to be read and cited. However, there is no clear guidance for those. Add a paragraph for DAMON academic papers on the kernel documentation for DAMON. [1] https://damonitor.github.io [2] https://dl.acm.org/doi/abs/10.1145/3366626.3368125 [3] https://dl.acm.org/doi/abs/10.1145/3502181.353146 Link: https://lkml.kernel.org/r/20241101203557.55210-1-sj@kernel.org Signed-off-by: SeongJae Park Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- Documentation/mm/damon/index.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Documentation/mm/damon/index.rst b/Documentation/mm/damon/index.rst index dafd6d0289243..5a3359704ccea 100644 --- a/Documentation/mm/damon/index.rst +++ b/Documentation/mm/damon/index.rst @@ -37,3 +37,9 @@ with no code but simple configurations. To utilize and control DAMON from the user-space, please refer to the administration :doc:`guide `. + +If you prefer academic papers for reading and citations, please use the papers +from `HPDC'22 `_ and +`Middleware19 Industry `_ . +Note that those cover DAMON implementations in Linux v5.16 and v5.15, +respectively. From e51e10fadb2a5d3fcf12c79cc0fa7171286d5836 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 1 Nov 2024 13:23:10 -0700 Subject: [PATCH 208/215] MAINTAINERS/MEMORY MANAGEMENT: add document files for mm Memory managment subsystem documents ('admin-guide/mm/' and 'mm/' under 'Documentation/') are not marked as managed under memory management subsystem. This makes 'get_maintainer.pl' for changes to the documents sub-optimal. Mark the documents as part of mm subsystem on MAINTAINERS file. Link: https://lkml.kernel.org/r/20241101202311.53935-1-sj@kernel.org Signed-off-by: SeongJae Park Acked-by: David Hildenbrand Acked-by: Mike Rapoport (Microsoft) Cc: Jonathan Corbet Signed-off-by: Andrew Morton --- MAINTAINERS | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 27005b0fada97..0383fd7e0a40c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14848,6 +14848,8 @@ S: Maintained W: http://www.linux-mm.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm T: quilt git://git.kernel.org/pub/scm/linux/kernel/git/akpm/25-new +F: Documentation/admin-guide/mm/ +F: Documentation/mm/ F: include/linux/gfp.h F: include/linux/gfp_types.h F: include/linux/memfd.h From 9f3310ccc71efff041fed3f8be5ad19b0feab30b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 5 Nov 2024 12:50:35 +0100 Subject: [PATCH 209/215] zram: ZRAM_DEF_COMP should depend on ZRAM When Compressed RAM block device support is disabled, the CONFIG_ZRAM_DEF_COMP symbol still ends up in the generated config file: CONFIG_ZRAM_DEF_COMP="unset-value" While this causes no real harm, avoid polluting the config file by adding a dependency on ZRAM. 
Link: https://lkml.kernel.org/r/64e05bad68a9bd5cc322efd114a04d25de525940.1730807319.git.geert@linux-m68k.org Fixes: 917a59e81c34 ("zram: introduce custom comp backends API") Signed-off-by: Geert Uytterhoeven Reviewed-by: Sergey Senozhatsky Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 6aea609b795c2..402b7b1758632 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -94,6 +94,7 @@ endchoice config ZRAM_DEF_COMP string + depends on ZRAM default "lzo-rle" if ZRAM_DEF_COMP_LZORLE default "lzo" if ZRAM_DEF_COMP_LZO default "lz4" if ZRAM_DEF_COMP_LZ4 From 9b5c87d47949292ff21ee2fadd1e83820662a430 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Tue, 5 Nov 2024 12:34:57 +0100 Subject: [PATCH 210/215] mm: mmap_lock: check trace_mmap_lock_$type_enabled() instead of regcount Since 7d6be67cfdd4 ("mm: mmap_lock: replace get_memcg_path_buf() with on-stack buffer") we use trace_mmap_lock_reg()/unreg() only to maintain an atomic reg_refcount which is checked to avoid performing get_mm_memcg_path() in case none of the tracepoints using it is enabled. This can be achieved directly by putting all the work needed for the tracepoint behind the trace_mmap_lock_##type##_enabled(), as suggested by Documentation/trace/tracepoints.rst and with the following advantages: - uses the tracepoint's static key instead of evaluating a branch - the check tracepoint specific, not shared by all of them - we can get rid of trace_mmap_lock_reg()/unreg() completely Thus use the trace_..._enabled() check and remove unnecessary code. Link: https://lkml.kernel.org/r/20241105113456.95066-2-vbabka@suse.cz Signed-off-by: Vlastimil Babka Reviewed-by: Axel Rasmussen Cc: Tetsuo Handa Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton --- include/trace/events/mmap_lock.h | 14 ++++-------- mm/mmap_lock.c | 39 +++++++------------------------- 2 files changed, 12 insertions(+), 41 deletions(-) diff --git a/include/trace/events/mmap_lock.h b/include/trace/events/mmap_lock.h index f2827f98a44f3..bc2e3ad787b3b 100644 --- a/include/trace/events/mmap_lock.h +++ b/include/trace/events/mmap_lock.h @@ -10,9 +10,6 @@ struct mm_struct; -extern int trace_mmap_lock_reg(void); -extern void trace_mmap_lock_unreg(void); - DECLARE_EVENT_CLASS(mmap_lock, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write), @@ -40,16 +37,15 @@ DECLARE_EVENT_CLASS(mmap_lock, ); #define DEFINE_MMAP_LOCK_EVENT(name) \ - DEFINE_EVENT_FN(mmap_lock, name, \ + DEFINE_EVENT(mmap_lock, name, \ TP_PROTO(struct mm_struct *mm, const char *memcg_path, \ bool write), \ - TP_ARGS(mm, memcg_path, write), \ - trace_mmap_lock_reg, trace_mmap_lock_unreg) + TP_ARGS(mm, memcg_path, write)) DEFINE_MMAP_LOCK_EVENT(mmap_lock_start_locking); DEFINE_MMAP_LOCK_EVENT(mmap_lock_released); -TRACE_EVENT_FN(mmap_lock_acquire_returned, +TRACE_EVENT(mmap_lock_acquire_returned, TP_PROTO(struct mm_struct *mm, const char *memcg_path, bool write, bool success), @@ -76,9 +72,7 @@ TRACE_EVENT_FN(mmap_lock_acquire_returned, __get_str(memcg_path), __entry->write ? "true" : "false", __entry->success ? 
"true" : "false" - ), - - trace_mmap_lock_reg, trace_mmap_lock_unreg + ) ); #endif /* _TRACE_MMAP_LOCK_H */ diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c index 368b840e75082..f186d57df2c68 100644 --- a/mm/mmap_lock.c +++ b/mm/mmap_lock.c @@ -19,43 +19,23 @@ EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released); #ifdef CONFIG_MEMCG -static atomic_t reg_refcount; - /* * Size of the buffer for memcg path names. Ignoring stack trace support, * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it. */ #define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL -int trace_mmap_lock_reg(void) -{ - atomic_inc(®_refcount); - return 0; -} - -void trace_mmap_lock_unreg(void) -{ - atomic_dec(®_refcount); -} - -#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ - do { \ - char buf[MEMCG_PATH_BUF_SIZE]; \ - get_mm_memcg_path(mm, buf, sizeof(buf)); \ - trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ +#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ + do { \ + if (trace_mmap_lock_##type##_enabled()) { \ + char buf[MEMCG_PATH_BUF_SIZE]; \ + get_mm_memcg_path(mm, buf, sizeof(buf)); \ + trace_mmap_lock_##type(mm, buf, ##__VA_ARGS__); \ + } \ } while (0) #else /* !CONFIG_MEMCG */ -int trace_mmap_lock_reg(void) -{ - return 0; -} - -void trace_mmap_lock_unreg(void) -{ -} - #define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \ trace_mmap_lock_##type(mm, "", ##__VA_ARGS__) @@ -65,16 +45,13 @@ void trace_mmap_lock_unreg(void) #ifdef CONFIG_MEMCG /* * Write the given mm_struct's memcg path to a buffer. If the path cannot be - * determined or the trace event is being unregistered, empty string is written. + * determined, empty string is written. */ static void get_mm_memcg_path(struct mm_struct *mm, char *buf, size_t buflen) { struct mem_cgroup *memcg; buf[0] = '\0'; - /* No need to get path if no trace event is registered. */ - if (!atomic_read(®_refcount)) - return; memcg = get_mem_cgroup_from_mm(mm); if (memcg == NULL) return; From 2ea80b039b9af0b71c00378523b71c254fb99c23 Mon Sep 17 00:00:00 2001 From: MengEn Sun Date: Fri, 1 Nov 2024 12:06:38 +0800 Subject: [PATCH 211/215] vmstat: call fold_vm_zone_numa_events() before show per zone NUMA event Since 5.14-rc1, NUMA events will only be folded from per-CPU statistics to per zone and global statistics when the user actually needs it. Currently, the kernel has performs the fold operation when reading /proc/vmstat, but does not perform the fold operation in /proc/zoneinfo. This can lead to inaccuracies in the following statistics in zoneinfo: - numa_hit - numa_miss - numa_foreign - numa_interleave - numa_local - numa_other Therefore, before printing per-zone vm_numa_event when reading /proc/zoneinfo, we should also perform the fold operation. 
Link: https://lkml.kernel.org/r/1730433998-10461-1-git-send-email-mengensun@tencent.com Fixes: f19298b9516c ("mm/vmstat: convert NUMA statistics to basic NUMA counters") Signed-off-by: MengEn Sun Reviewed-by: JinLiang Zheng Cc: Signed-off-by: Andrew Morton --- mm/vmstat.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/vmstat.c b/mm/vmstat.c index 3d82bb906dcf2..11a37c528395e 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1780,6 +1780,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone_page_state(zone, i)); #ifdef CONFIG_NUMA + fold_vm_zone_numa_events(zone); for (i = 0; i < NR_VM_NUMA_EVENT_ITEMS; i++) seq_printf(m, "\n %-12s %lu", numa_stat_name(i), zone_numa_event_state(zone, i)); From 05d4532b60e3e6e2a094ec56a88d1def50bd2430 Mon Sep 17 00:00:00 2001 From: Joshua Hahn Date: Fri, 1 Nov 2024 13:44:02 -0700 Subject: [PATCH 212/215] memcg/hugetlb: add hugeTLB counters to memcg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch introduces a new counter to memory.stat that tracks hugeTLB usage, only if hugeTLB accounting is done to memory.current. This feature is enabled the same way hugeTLB accounting is enabled, via the memory_hugetlb_accounting mount flag for cgroupsv2. 1. Why is this patch necessary? Currently, memcg hugeTLB accounting is an opt-in feature [1] that adds hugeTLB usage to memory.current. However, the metric is not reported in memory.stat. Given that users often interpret memory.stat as a breakdown of the value reported in memory.current, the disparity between the two reports can be confusing. This patch solves this problem by including the metric in memory.stat as well, but only if it is also reported in memory.current (it would also be confusing if the value was reported in memory.stat, but not in memory.current) Aside from the consistency between the two files, we also see benefits in observability. Userspace might be interested in the hugeTLB footprint of cgroups for many reasons. For instance, system admins might want to verify that hugeTLB usage is distributed as expected across tasks: i.e. memory-intensive tasks are using more hugeTLB pages than tasks that don't consume a lot of memory, or are seen to fault frequently. Note that this is separate from wanting to inspect the distribution for limiting purposes (in which case, hugeTLB controller makes more sense). 2. We already have a hugeTLB controller. Why not use that? It is true that hugeTLB tracks the exact value that we want. In fact, by enabling the hugeTLB controller, we get all of the observability benefits that I mentioned above, and users can check the total hugeTLB usage, verify if it is distributed as expected, etc. With this said, there are 2 problems: (a) They are still not reported in memory.stat, which means the disparity between the memcg reports are still there. (b) We cannot reasonably expect users to enable the hugeTLB controller just for the sake of hugeTLB usage reporting, especially since they don't have any use for hugeTLB usage enforcing [2]. 3. Implementation Details: In the alloc / free hugetlb functions, we call lruvec_stat_mod_folio regardless of whether memcg accounts hugetlb. mem_cgroup_commit_charge which is called from alloc_hugetlb_folio will set memcg for the folio only if the CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING cgroup mount option is used, so lruvec_stat_mod_folio accounts per-memcg hugetlb counters only if the feature is enabled. 
Regardless of whether memcg accounts for hugetlb, the newly added global counter is updated and shown in /proc/vmstat. The global counter is added because vmstats is the preferred framework for cgroup stats. It makes stat items consistent between global and cgroups. It also provides a per-node breakdown, which is useful. Because it does not use cgroup-specific hooks, we also keep generic MM code separate from memcg code. [1] https://lore.kernel.org/all/20231006184629.155543-1-nphamcs@gmail.com/ [2] Of course, we can't make a new patch for every feature that can be duplicated. However, since the existing solution of enabling the hugeTLB controller is an imperfect solution that still leaves a discrepancy between memory.stat and memory.curent, I think that it is reasonable to isolate the feature in this case. Link: https://lkml.kernel.org/r/20241101204402.1885383-1-joshua.hahnjy@gmail.com Signed-off-by: Joshua Hahn Suggested-by: Nhat Pham Suggested-by: Shakeel Butt Suggested-by: Johannes Weiner Acked-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Chris Down Acked-by: Michal Hocko Reviewed-by: Roman Gushchin Reviewed-by: Nhat Pham Cc: Jonathan Corbet Cc: Michal Koutný Cc: Muchun Song Cc: Zefan Li Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v2.rst | 5 +++++ include/linux/mmzone.h | 3 +++ mm/hugetlb.c | 2 ++ mm/memcontrol.c | 11 +++++++++++ mm/vmstat.c | 3 +++ 5 files changed, 24 insertions(+) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 6d02168d78bed..02dc54fe1f530 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1655,6 +1655,11 @@ The following nested keys are defined. pgdemote_khugepaged Number of pages demoted by khugepaged. + hugetlb + Amount of memory used by hugetlb pages. This metric only shows + up if hugetlb usage is accounted for in memory.current (i.e. + cgroup is mounted with the memory_hugetlb_accounting option). + memory.numa_stat A read-only nested-keyed file which exists on non-root cgroups. 
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5e8f567753bdd..b36124145a16f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -220,6 +220,9 @@ enum node_stat_item { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, +#ifdef CONFIG_HUGETLB_PAGE + NR_HUGETLB, +#endif NR_VM_NODE_STAT_ITEMS }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2c8c5da0f5d32..ea2ed8e301ef2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1925,6 +1925,7 @@ void free_huge_folio(struct folio *folio) pages_per_huge_page(h), folio); hugetlb_cgroup_uncharge_folio_rsvd(hstate_index(h), pages_per_huge_page(h), folio); + lruvec_stat_mod_folio(folio, NR_HUGETLB, -pages_per_huge_page(h)); mem_cgroup_uncharge(folio); if (restore_reserve) h->resv_huge_pages++; @@ -3093,6 +3094,7 @@ struct folio *alloc_hugetlb_folio(struct vm_area_struct *vma, if (!memcg_charge_ret) mem_cgroup_commit_charge(folio, memcg); + lruvec_stat_mod_folio(folio, NR_HUGETLB, pages_per_huge_page(h)); mem_cgroup_put(memcg); return folio; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ed2dd88437cab..7b3503d12aaf1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -315,6 +315,9 @@ static const unsigned int memcg_node_stat_items[] = { PGDEMOTE_KSWAPD, PGDEMOTE_DIRECT, PGDEMOTE_KHUGEPAGED, +#ifdef CONFIG_HUGETLB_PAGE + NR_HUGETLB, +#endif }; static const unsigned int memcg_stat_items[] = { @@ -1366,6 +1369,9 @@ static const struct memory_stat memory_stats[] = { { "unevictable", NR_UNEVICTABLE }, { "slab_reclaimable", NR_SLAB_RECLAIMABLE_B }, { "slab_unreclaimable", NR_SLAB_UNRECLAIMABLE_B }, +#ifdef CONFIG_HUGETLB_PAGE + { "hugetlb", NR_HUGETLB }, +#endif /* The memory events */ { "workingset_refault_anon", WORKINGSET_REFAULT_ANON }, @@ -1461,6 +1467,11 @@ static void memcg_stat_format(struct mem_cgroup *memcg, struct seq_buf *s) for (i = 0; i < ARRAY_SIZE(memory_stats); i++) { u64 size; +#ifdef CONFIG_HUGETLB_PAGE + if (unlikely(memory_stats[i].idx == NR_HUGETLB) && + !(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_HUGETLB_ACCOUNTING)) + continue; +#endif size = memcg_page_state_output(memcg, memory_stats[i].idx); seq_buf_printf(s, "%s %llu\n", memory_stats[i].name, size); diff --git a/mm/vmstat.c b/mm/vmstat.c index 11a37c528395e..4d016314a56c9 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1273,6 +1273,9 @@ const char * const vmstat_text[] = { "pgdemote_kswapd", "pgdemote_direct", "pgdemote_khugepaged", +#ifdef CONFIG_HUGETLB_PAGE + "nr_hugetlb", +#endif /* system-wide enum vm_stat_item counters */ "nr_dirty_threshold", "nr_dirty_background_threshold", From f364cdeb38938f9d03061682b8ff3779dd1730e5 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 8 Nov 2024 18:01:47 +0800 Subject: [PATCH 213/215] zram: fix NULL pointer in comp_algorithm_show() LTP reported a NULL pointer dereference as followed: CPU: 7 UID: 0 PID: 5995 Comm: cat Kdump: loaded Not tainted 6.12.0-rc6+ #3 Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : __pi_strcmp+0x24/0x140 lr : zcomp_available_show+0x60/0x100 [zram] sp : ffff800088b93b90 x29: ffff800088b93b90 x28: 0000000000000001 x27: 0000000000400cc0 x26: 0000000000000ffe x25: ffff80007b3e2388 x24: 0000000000000000 x23: ffff80007b3e2390 x22: ffff0004041a9000 x21: ffff80007b3e2900 x20: 0000000000000000 x19: 0000000000000000 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000000 x10: 
ffff80007b3e2900 x9 : ffff80007b3cb280 x8 : 0101010101010101 x7 : 0000000000000000 x6 : 0000000000000000 x5 : 0000000000000040 x4 : 0000000000000000 x3 : 00656c722d6f7a6c x2 : 0000000000000000 x1 : ffff80007b3e2900 x0 : 0000000000000000 Call trace: __pi_strcmp+0x24/0x140 comp_algorithm_show+0x40/0x70 [zram] dev_attr_show+0x28/0x80 sysfs_kf_seq_show+0x90/0x140 kernfs_seq_show+0x34/0x48 seq_read_iter+0x1d4/0x4e8 kernfs_fop_read_iter+0x40/0x58 new_sync_read+0x9c/0x168 vfs_read+0x1a8/0x1f8 ksys_read+0x74/0x108 __arm64_sys_read+0x24/0x38 invoke_syscall+0x50/0x120 el0_svc_common.constprop.0+0xc8/0xf0 do_el0_svc+0x24/0x38 el0_svc+0x38/0x138 el0t_64_sync_handler+0xc0/0xc8 el0t_64_sync+0x188/0x190 The zram->comp_algs[ZRAM_PRIMARY_COMP] can be NULL in zram_add() if comp_algorithm_set() has not been called. User can access the zram device by sysfs after device_add_disk(), so there is a time window to trigger the NULL pointer dereference. Move it ahead device_add_disk() to make sure when user can access the zram device, it is ready. comp_algorithm_set() is protected by zram->init_lock in other places and no such problem. Link: https://lkml.kernel.org/r/20241108100147.3776123-1-liushixin2@huawei.com Fixes: 7ac07a26dea7 ("zram: preparation for multi-zcomp support") Signed-off-by: Liu Shixin Reviewed-by: Sergey Senozhatsky Cc: Jens Axboe Cc: Minchan Kim Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index cee49bb0126d9..3dee026988dc8 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2572,6 +2572,8 @@ static int zram_add(void) zram->disk->private_data = zram; snprintf(zram->disk->disk_name, 16, "zram%d", device_id); atomic_set(&zram->pp_in_progress, 0); + zram_comp_params_reset(zram); + comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); /* Actual capacity set using sysfs (/sys/block/zram/disksize */ set_capacity(zram->disk, 0); @@ -2579,9 +2581,6 @@ static int zram_add(void) if (ret) goto out_cleanup_disk; - zram_comp_params_reset(zram); - comp_algorithm_set(zram, ZRAM_PRIMARY_COMP, default_compressor); - zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; From 811808d365398680b628d2b88aafeba77c88691a Mon Sep 17 00:00:00 2001 From: Nirjhar Roy Date: Fri, 18 Oct 2024 23:16:01 +0530 Subject: [PATCH 214/215] mm/kfence: add a new kunit test test_use_after_free_read_nofault() Faults from copy_from_kernel_nofault() need to be handled by fixup table and should not be handled by kfence. Otherwise while reading /proc/kcore which uses copy_from_kernel_nofault(), kfence can generate false negatives. This can happen when /proc/kcore ends up reading an unmapped address from kfence pool. Let's add a testcase to cover this case. 
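For reference, a kernel-style sketch of the contract the new test leans on (the helper name here is hypothetical): copy_from_kernel_nofault() turns a fault on an unmapped source address into an -EFAULT return via the exception fixup table, so probing such an address must not surface as a KFENCE report.

#include <linux/uaccess.h>

/* Hypothetical helper: probe one byte of kernel memory without faulting. */
static bool kernel_byte_is_readable(const void *addr)
{
	char byte;

	/* Returns 0 on success, -EFAULT if 'addr' is not mapped. */
	return copy_from_kernel_nofault(&byte, addr, 1) == 0;
}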
Link: https://lkml.kernel.org/r/210e561f7845697a32de44b643393890f180069f.1729272697.git.ritesh.list@gmail.com Signed-off-by: Nirjhar Roy Co-developed-by: Ritesh Harjani (IBM) Signed-off-by: Ritesh Harjani (IBM) Tested-by: Marco Elver Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kfence/kfence_test.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/mm/kfence/kfence_test.c b/mm/kfence/kfence_test.c index 00fd17285285b..f65fb182466d5 100644 --- a/mm/kfence/kfence_test.c +++ b/mm/kfence/kfence_test.c @@ -383,6 +383,22 @@ static void test_use_after_free_read(struct kunit *test) KUNIT_EXPECT_TRUE(test, report_matches(&expect)); } +static void test_use_after_free_read_nofault(struct kunit *test) +{ + const size_t size = 32; + char *addr; + char dst; + int ret; + + setup_test_cache(test, size, 0, NULL); + addr = test_alloc(test, size, GFP_KERNEL, ALLOCATE_ANY); + test_free(addr); + /* Use after free with *_nofault() */ + ret = copy_from_kernel_nofault(&dst, addr, 1); + KUNIT_EXPECT_EQ(test, ret, -EFAULT); + KUNIT_EXPECT_FALSE(test, report_available()); +} + static void test_double_free(struct kunit *test) { const size_t size = 32; @@ -780,6 +796,7 @@ static struct kunit_case kfence_test_cases[] = { KFENCE_KUNIT_CASE(test_out_of_bounds_read), KFENCE_KUNIT_CASE(test_out_of_bounds_write), KFENCE_KUNIT_CASE(test_use_after_free_read), + KFENCE_KUNIT_CASE(test_use_after_free_read_nofault), KFENCE_KUNIT_CASE(test_double_free), KFENCE_KUNIT_CASE(test_invalid_addr_free), KFENCE_KUNIT_CASE(test_corruption), From 2532e6c74a67e65b95f310946e0c0e0a41b3a34b Mon Sep 17 00:00:00 2001 From: "Ritesh Harjani (IBM)" Date: Wed, 13 Nov 2024 19:49:54 +0530 Subject: [PATCH 215/215] cma: enforce non-zero pageblock_order during cma_init_reserved_mem() cma_init_reserved_mem() checks base and size alignment with CMA_MIN_ALIGNMENT_BYTES. However, some users might call this during early boot when pageblock_order is 0. That means if base and size does not have pageblock_order alignment, it can cause functional failures during cma activate area. So let's enforce pageblock_order to be non-zero during cma_init_reserved_mem() to catch such wrong usages. 1. This was seen with fadump on PowerPC which was calling cma_init_reserved_mem() before the pageblock_order was initialized. This is now fixed in the fadump on PowerPC itself. The details of that can be found in the patch including the userspace-visible effect of the issue [1]. 2. However it was also decided that we should add a stronger enforcement check within cma_init_reserved_mem() to catch such wrong usages [2]. Hence this patch. This is ok to be in -next and there is no "Fixes" tag required for this patch. 
[1]: https://lore.kernel.org/all/3ae208e48c0d9cefe53d2dc4f593388067405b7d.1729146153.git.ritesh.list@gmail.com/ [2]: https://lore.kernel.org/all/83eb128e-4f06-4725-a843-a4563f246a44@redhat.com/ Link: https://lkml.kernel.org/r/e274344b44d5f80fa54c52f530387257fe99ec65.1731505681.git.ritesh.list@gmail.com Signed-off-by: Ritesh Harjani (IBM) Acked-by: David Hildenbrand Acked-by: Zi Yan Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/cma.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/cma.c b/mm/cma.c index c5869d0001ad1..de5bc0c81fc23 100644 --- a/mm/cma.c +++ b/mm/cma.c @@ -181,6 +181,15 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size, if (!size || !memblock_is_region_reserved(base, size)) return -EINVAL; + /* + * CMA uses CMA_MIN_ALIGNMENT_BYTES as alignment requirement which + * needs pageblock_order to be initialized. Let's enforce it. + */ + if (!pageblock_order) { + pr_err("pageblock_order not yet initialized. Called during early boot?\n"); + return -EINVAL; + } + /* ensure minimal alignment required by mm core */ if (!IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES)) return -EINVAL;
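A short illustrative sketch of why the new check matters (hypothetical helper; the macro relationships are paraphrased from include/linux/cma.h and include/linux/pageblock-flags.h, where CMA_MIN_ALIGNMENT_BYTES works out to PAGE_SIZE * (1UL << pageblock_order)): while pageblock_order is still 0 during early boot, the existing IS_ALIGNED() test degenerates to a plain page-alignment check and no longer guarantees the pageblock alignment that cma_activate_area() later relies on.

#include <linux/align.h>
#include <linux/cma.h>
#include <linux/pageblock-flags.h>
#include <linux/types.h>

/* Hypothetical helper, not kernel code: what the alignment check amounts to. */
static bool cma_region_looks_usable(phys_addr_t base, phys_addr_t size)
{
	/*
	 * With pageblock_order == 0, CMA_MIN_ALIGNMENT_BYTES collapses to
	 * PAGE_SIZE, so the IS_ALIGNED() test below would accept regions
	 * that are only page aligned. Refuse early instead, as
	 * cma_init_reserved_mem() now does.
	 */
	if (!pageblock_order)
		return false;

	return IS_ALIGNED(base | size, CMA_MIN_ALIGNMENT_BYTES);
}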